import ast
import warnings
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import spacy
from gensim import downloader
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Suppress all warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')
# One-time setup: download the spaCy English model before the first run.
# !python -m spacy download en
## Параметры
# Location of the crawler output to analyse.
file_path = 'parsing_results'
file_name = 'parsing_articles.csv'
## Загрузка данных
# Load the scraped-articles table; the bare `data` expression renders
# the DataFrame as notebook cell output.
data = pd.read_csv(f'{file_path}/{file_name}')
data
| url | deapth | title | authors | source | number and pages | doi | published | citation | metric | abstract | references | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 0 | Demand driven store site selection via multipl... | ['Mengwen Xu', 'Tianyi Wang', 'Zhengwei Wu', '... | SIGSPACIAL '16: Proceedings of the 24th ACM SI... | Article No.: 40, Pages 1 - 10 | https://doi.org/10.1145/2996913.2996996 | 31 October 2016 | 26 | 617 | Choosing a good location when opening a new st... | ['https://dl.acm.org/doi/10.1016/S0305-0548(01... |
| 1 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | The generalized maximal covering location problem | ['Oded Berman', 'Dmitry Krass'] | Computers and Operations Research | NaN | https://doi.org/10.1016/S0305-0548(01)00079-X | 01 May 2002 | 34 | 0 | We consider a generalization of the maximal co... | [] |
| 2 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Random Forests | ['Leo Breiman'] | Machine Learning | NaN | https://doi.org/10.1023/A:1010933404324 | 01 October 2001 | 9,828 | 0 | Random forests are a combination of tree predi... | ['https://dl.acm.org/doi/10.1162/neco.1997.9.7... |
| 3 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Efficient algorithms for optimal location quer... | ['Zitong Chen', 'Yubao Liu', 'Raymond Chi-Wing... | SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... | NaN | https://doi.org/10.1145/2588555.2612172 | 18 June 2014 | 47 | 790 | In this paper, we study the optimal location q... | ['https://dl.acm.org/doi/10.14778/2350229.2350... |
| 4 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Mean Shift: A Robust Approach Toward Feature S... | ['Dorin Comaniciu', 'Peter Meer'] | IEEE Transactions on Pattern Analysis and Mach... | NaN | https://doi.org/10.1109/34.1000236 | 01 May 2002 | 2,062 | 0 | A general nonparametric technique is proposed ... | ['https://dl.acm.org/doi/10.1007/BF00128233', ... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 254 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Geographical topic discovery and comparison | ['Zhijun Yin', 'Liangliang Cao', 'Jiawei Han',... | WWW '11: Proceedings of the 20th international... | NaN | https://doi.org/10.1145/1963405.1963443 | 28 March 2011 | 232 | 1,642 | This paper studies the problem of discovering ... | ['https://dl.acm.org/doi/10.5555/944919.944937... |
| 255 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Driving with knowledge from the physical world | ['Jing Yuan', 'Yu Zheng', 'Xing Xie', 'Guangzh... | KDD '11: Proceedings of the 17th ACM SIGKDD in... | NaN | https://doi.org/10.1145/2020408.2020462 | 21 August 2011 | 641 | 2,908 | This paper presents a Cloud-based system compu... | ['https://dl.acm.org/doi/10.1016/j.eswa.2008.0... |
| 256 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Where to find my next passenger | ['Jing Yuan', 'Yu Zheng', 'Liuhang Zhang', 'XI... | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030128 | 17 September 2011 | 276 | 2,024 | We present a recommender for taxi drivers and ... | ['https://dl.acm.org/doi/10.1145/304182.304187... |
| 257 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Urban computing with taxicabs | ['Yu Zheng', 'Yanchi Liu', 'Jing Yuan', 'Xing ... | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030126 | 17 September 2011 | 413 | 3,122 | Urban computing for city planning is one of th... | ['https://dl.acm.org/doi/10.5555/645484.656550... |
| 258 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | NaN | [] | NaN | NaN | NaN | NaN | NaN | NaN | NaN | [] |
259 rows × 12 columns
## Форматирование содержимого столбцов
# Convert stringified columns to their proper dtypes.
# 'authors' and 'references' were serialized as Python list literals
# (e.g. "['Leo Breiman']"); ast.literal_eval parses them safely,
# unlike eval, which would execute arbitrary expressions from the CSV.
data['authors'] = data['authors'].apply(ast.literal_eval)
data['references'] = data['references'].apply(ast.literal_eval)
# Dates arrive as e.g. "31 October 2016".
data['published'] = pd.to_datetime(data['published'], format='%d %B %Y')
# Citation/metric counts use thousands separators (e.g. "9,828"):
# strip the commas, then cast to float (NaNs are preserved).
for col in ('citation', 'metric'):
    data[col] = data[col].replace(',', '', regex=True).astype('float')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 259 entries, 0 to 258 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 url 259 non-null object 1 deapth 259 non-null int64 2 title 230 non-null object 3 authors 259 non-null object 4 source 230 non-null object 5 number and pages 9 non-null object 6 doi 187 non-null object 7 published 230 non-null datetime64[ns] 8 citation 232 non-null float64 9 metric 230 non-null float64 10 abstract 223 non-null object 11 references 259 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(8) memory usage: 24.4+ KB
## Просмотр данных
# Distribution of seed URLs: how many rows each crawled article contributed.
data['url'].value_counts(dropna=False)
url https://dl.acm.org/doi/10.1145/2996913.2996996 175 https://dl.acm.org/doi/10.1145/2487575.2487616 84 Name: count, dtype: int64
# Crawl depth distribution (column name 'deapth' is a typo inherited
# from the parser's output schema — kept as-is to match the CSV).
data['deapth'].value_counts(dropna=False)
deapth 2 230 1 27 0 2 Name: count, dtype: int64
# Title frequencies; repeated titles indicate articles reached from
# multiple seeds, NaN marks rows the parser could not resolve.
data['title'].value_counts(dropna=False)
title
NaN 29
Fast training of support vector machines using sequential minimal optimization 4
The generalized maximal covering location problem 3
Discovering regions of different functions in a city using human mobility and POIs 3
Support Vector Machines 3
..
The Cascaded Hough Transform as an Aid in Aerial Image Interpretation 1
A new approach to clustering 1
Pfinder: Real-Time Tracking of the Human Body 1
Region Competition: Unifying Snakes, Region Growing, and Bayes/MDL for Multiband Image Segmentation 1
Urban computing with taxicabs 1
Name: count, Length: 195, dtype: int64
# Author-list frequencies; the empty list [] marks unparsed rows.
data['authors'].value_counts(dropna=False)
authors
[] 30
[Thorsten Joachims] 4
[Kalervo Järvelin, Jaana Kekäläinen] 4
[John C. Platt] 4
[Daniele Quercia, Neal Lathia, Francesco Calabrese, Giusy Di Lorenzo, Jon Crowcroft] 3
..
[Christopher Richard Wren, Ali Azarbayejani, Trevor Darrell, Alex Paul Pentland] 1
[Song Chun Zhu, Alan Yuille] 1
[Xinhua Zhuang, Yan Huang, K. Palaniappan, Yunxin Zhao] 1
[Zhe Cao, Tao Qin, Tie-Yan Liu, Ming-Feng Tsai, Hang Li] 1
[Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie] 1
Name: count, Length: 188, dtype: int64
# Publication venue (journal / proceedings) frequencies.
data['source'].value_counts(dropna=False)
source
NaN 29
IEEE Transactions on Pattern Analysis and Machine Intelligence 15
Advances in kernel methods: support vector learning 8
KDD '11: Proceedings of the 17th ACM SIGKDD international conference on Knowledge discovery and data mining 6
ACM Transactions on Information Systems (TOIS) 6
..
SIGMOD '03: Proceedings of the 2003 ACM SIGMOD international conference on Management of data 1
AAAI '98/IAAI '98: Proceedings of the fifteenth national/tenth conference on Artificial intelligence/Innovative applications of artificial intelligence 1
SIGMOD '07: Proceedings of the 2007 ACM SIGMOD international conference on Management of data 1
Numerische Mathematik 1
SIGSPACIAL '16: Proceedings of the 24th ACM SIGSPATIAL International Conference on Advances in Geographic Information Systems 1
Name: count, Length: 129, dtype: int64
# Article number / page-range info — populated for very few rows.
data['number and pages'].value_counts(dropna=False)
number and pages NaN 250 Article No.: 5, Pages 1 - 44 2 Article No.: 40, Pages 1 - 10 1 Article No.: 85, Pages 1 - 4 1 Article No.: 38, Pages 1 - 55 1 Article No.: 23, Pages 1 - 27 1 Article No.: 29, Pages 1 - 41 1 Article No.: 2, Pages 1 - 29 1 Article No.: 11, Pages 1 - 10 1 Name: count, dtype: int64
# DOI frequencies; duplicates confirm the same article appears under
# several seed URLs / crawl depths.
data['doi'].value_counts(dropna=False)
doi
NaN 72
https://doi.org/10.1109/ICDM.2010.152 3
https://doi.org/10.1016/S0305-0548(01)00079-X 3
https://doi.org/10.1145/2339530.2339561 3
https://doi.org/10.1109/5254.708428 3
..
https://doi.org/10.1109/34.790435 1
https://doi.org/10.1109/34.88566 1
https://doi.org/10.1006/cviu.1999.0801 1
https://doi.org/10.1109/83.855433 1
https://doi.org/10.1145/2030112.2030126 1
Name: count, Length: 161, dtype: int64
# Publication-date frequencies (NaT = date missing or unparsed).
data['published'].value_counts(dropna=False)
published
NaT 29
1999-02-08 8
1998-07-01 6
2011-08-21 6
2002-05-01 4
..
2011-12-01 1
2011-10-24 1
2003-09-09 1
2014-09-18 1
2016-10-31 1
Name: count, Length: 151, dtype: int64
# Aggregate by year: normalize every date to January 1st of its year
# (NaT passes through .replace unchanged) and count.
data['published'].apply(lambda date: date.replace(day=1, month=1)).value_counts(dropna=False)
published NaT 29 2011-01-01 29 1998-01-01 22 1999-01-01 20 2010-01-01 15 2009-01-01 14 2012-01-01 14 2008-01-01 10 2007-01-01 10 2002-01-01 10 2006-01-01 9 1997-01-01 9 2013-01-01 8 2000-01-01 8 2003-01-01 7 2014-01-01 6 1996-01-01 5 1992-01-01 5 2005-01-01 4 1994-01-01 3 1989-01-01 3 2015-01-01 2 2004-01-01 2 1991-01-01 2 1990-01-01 2 2001-01-01 2 1962-01-01 1 1980-01-01 1 1984-01-01 1 2016-01-01 1 1985-01-01 1 1995-01-01 1 1983-01-01 1 1959-01-01 1 1987-01-01 1 Name: count, dtype: int64
# Same yearly aggregation, drawn as a time-ordered line plot (NaT excluded).
data['published'].apply(lambda date: date.replace(day=1, month=1)).value_counts(dropna=True).sort_index().plot();
# Citation counts in ascending order; NaNs sort to the end.
data['citation'].sort_values()
161 1.0
57 3.0
51 3.0
65 3.0
182 4.0
...
228 NaN
229 NaN
242 NaN
252 NaN
258 NaN
Name: citation, Length: 259, dtype: float64
# Histogram of citation counts (heavily right-skewed).
data['citation'].hist();
# Ten most-cited articles in the crawl.
data.sort_values(by='citation', ascending=False).head(10)
| url | deapth | title | authors | source | number and pages | doi | published | citation | metric | abstract | references | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 237 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | The WEKA data mining software: an update | [Mark Hall, Eibe Frank, Geoffrey Holmes, Bernh... | ACM SIGKDD Explorations Newsletter | NaN | https://doi.org/10.1145/1656274.1656278 | 2009-11-16 | 13690.0 | 21242.0 | More than twelve years have elapsed since the ... | [https://dl.acm.org/doi/10.5555/998688.1007097... |
| 2 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Random Forests | [Leo Breiman] | Machine Learning | NaN | https://doi.org/10.1023/A:1010933404324 | 2001-10-01 | 9828.0 | 0.0 | Random forests are a combination of tree predi... | [https://dl.acm.org/doi/10.1162/neco.1997.9.7.... |
| 245 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Latent dirichlet allocation | [David M. Blei, Andrew Y. Ng, Michael I. Jordan] | The Journal of Machine Learning Research | NaN | NaN | 2003-03-01 | 7820.0 | 36452.0 | We describe latent Dirichlet allocation (LDA),... | [] |
| 159 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 2 | Latent dirichlet allocation | [David M. Blei, Andrew Y. Ng, Michael I. Jordan] | The Journal of Machine Learning Research | NaN | NaN | 2003-03-01 | 7820.0 | 36452.0 | We describe latent Dirichlet allocation (LDA),... | [] |
| 13 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Scikit-learn: Machine Learning in Python | [Fabian Pedregosa, Gaël Varoquaux, Alexandre G... | The Journal of Machine Learning Research | NaN | NaN | 2011-11-01 | 7648.0 | 19849.0 | Scikit-learn is a Python module integrating a ... | [] |
| 94 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 2 | A training algorithm for optimal margin classi... | [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... | COLT '92: Proceedings of the fifth annual work... | NaN | https://doi.org/10.1145/130385.130401 | 1992-07-01 | 6967.0 | 16072.0 | A training algorithm that maximizes the margin... | [https://dl.acm.org/doi/10.1162/neco.1989.1.1.... |
| 215 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | A training algorithm for optimal margin classi... | [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... | COLT '92: Proceedings of the fifth annual work... | NaN | https://doi.org/10.1145/130385.130401 | 1992-07-01 | 6967.0 | 16072.0 | A training algorithm that maximizes the margin... | [https://dl.acm.org/doi/10.1162/neco.1989.1.1.... |
| 197 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Authoritative sources in a hyperlinked environ... | [Jon M. Kleinberg] | Journal of the ACM (JACM) | NaN | https://doi.org/10.1145/324133.324140 | 1999-09-01 | 5920.0 | 20279.0 | The network structure of a hyperlinked environ... | [] |
| 244 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Probabilistic topic models | [David M. Blei] | Communications of the ACM | NaN | https://doi.org/10.1145/2133806.2133826 | 2012-04-01 | 3659.0 | 142464.0 | Surveying a suite of algorithms that offer a s... | [https://dl.acm.org/doi/10.5555/1795114.179511... |
| 181 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 1 | Cumulated gain-based evaluation of IR techniques | [Kalervo Järvelin, Jaana Kekäläinen] | ACM Transactions on Information Systems (TOIS) | NaN | https://doi.org/10.1145/582415.582418 | 2002-10-01 | 3225.0 | 9826.0 | Modern large retrieval environments tend to ov... | [] |
# Metric values in ascending order; NaNs sort to the end.
data['metric'].sort_values()
129 0.0
110 0.0
105 0.0
104 0.0
202 0.0
...
228 NaN
229 NaN
242 NaN
252 NaN
258 NaN
Name: metric, Length: 259, dtype: float64
# Histogram of the metric column.
data['metric'].hist();
# Ten articles with the highest metric value.
data.sort_values(by='metric', ascending=False).head(10)
| url | deapth | title | authors | source | number and pages | doi | published | citation | metric | abstract | references | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 244 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Probabilistic topic models | [David M. Blei] | Communications of the ACM | NaN | https://doi.org/10.1145/2133806.2133826 | 2012-04-01 | 3659.0 | 142464.0 | Surveying a suite of algorithms that offer a s... | [https://dl.acm.org/doi/10.5555/1795114.179511... |
| 159 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 2 | Latent dirichlet allocation | [David M. Blei, Andrew Y. Ng, Michael I. Jordan] | The Journal of Machine Learning Research | NaN | NaN | 2003-03-01 | 7820.0 | 36452.0 | We describe latent Dirichlet allocation (LDA),... | [] |
| 245 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Latent dirichlet allocation | [David M. Blei, Andrew Y. Ng, Michael I. Jordan] | The Journal of Machine Learning Research | NaN | NaN | 2003-03-01 | 7820.0 | 36452.0 | We describe latent Dirichlet allocation (LDA),... | [] |
| 237 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | The WEKA data mining software: an update | [Mark Hall, Eibe Frank, Geoffrey Holmes, Bernh... | ACM SIGKDD Explorations Newsletter | NaN | https://doi.org/10.1145/1656274.1656278 | 2009-11-16 | 13690.0 | 21242.0 | More than twelve years have elapsed since the ... | [https://dl.acm.org/doi/10.5555/998688.1007097... |
| 197 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Authoritative sources in a hyperlinked environ... | [Jon M. Kleinberg] | Journal of the ACM (JACM) | NaN | https://doi.org/10.1145/324133.324140 | 1999-09-01 | 5920.0 | 20279.0 | The network structure of a hyperlinked environ... | [] |
| 13 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Scikit-learn: Machine Learning in Python | [Fabian Pedregosa, Gaël Varoquaux, Alexandre G... | The Journal of Machine Learning Research | NaN | NaN | 2011-11-01 | 7648.0 | 19849.0 | Scikit-learn is a Python module integrating a ... | [] |
| 94 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 2 | A training algorithm for optimal margin classi... | [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... | COLT '92: Proceedings of the fifth annual work... | NaN | https://doi.org/10.1145/130385.130401 | 1992-07-01 | 6967.0 | 16072.0 | A training algorithm that maximizes the margin... | [https://dl.acm.org/doi/10.1162/neco.1989.1.1.... |
| 215 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | A training algorithm for optimal margin classi... | [Bernhard E. Boser, Isabelle M. Guyon, Vladimi... | COLT '92: Proceedings of the fifth annual work... | NaN | https://doi.org/10.1145/130385.130401 | 1992-07-01 | 6967.0 | 16072.0 | A training algorithm that maximizes the margin... | [https://dl.acm.org/doi/10.1162/neco.1989.1.1.... |
| 158 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 2 | Trajectory Data Mining: An Overview | [Yu Zheng] | ACM Transactions on Intelligent Systems and Te... | Article No.: 29, Pages 1 - 41 | https://doi.org/10.1145/2743025 | 2015-05-12 | 1230.0 | 14146.0 | The advances in location-acquisition and mobil... | [https://dl.acm.org/doi/10.1109/ICDE.2008.4497... |
| 234 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Algorithm 97: Shortest path | [Robert W. Floyd] | Communications of the ACM | NaN | https://doi.org/10.1145/367766.368168 | 1962-06-01 | 2828.0 | 11825.0 | NaN | [] |
# Abstract frequencies; note the 'No abstract available.' placeholder
# string, which must be treated as missing during cleaning.
data['abstract'].value_counts(dropna=False)
abstract
NaN 36
No abstract available. 29
The development of a city gradually fosters different functional regions, such as educational areas and business districts. In this paper, we propose a framework (titled DRoF) that Discovers Regions of different Functions in a city using both human mobility among regions and points of interests (POIs) located in a region. Specifically, we segment a city into disjointed regions according to major roads, such as highways and urban express ways. We infer the functions of each region using a topic-based inference model, which regards a region as a document, a function as a topic, categories of POIs (e.g., restaurants and shopping malls) as metadata (like authors, affiliations, and key words), and human mobility patterns (when people reach/leave a region and where people come from and leave for) as words. As a result, a region is represented by a distribution of functions, and a function is featured by a distribution of mobility patterns. We further identify the intensity of each function in different locations. The results generated by our framework can benefit a variety of applications, including urban planning, location choosing for a business, and social recommendations. We evaluated our method using large-scale and real-world datasets, consisting of two POI datasets of Beijing (in 2010 and 2011) and two 3-month GPS trajectory datasets (representing human mobility) generated by over 12,000 taxicabs in Beijing in 2010 and 2011 respectively. The results justify the advantages of our approach over baseline methods solely using POIs or human mobility. 3
Modern large retrieval environments tend to overwhelm their users by their large output. Since all documents are not of equal relevance to their users, highly relevant documents should be identified and ranked first for presentation. In order to develop IR techniques in this direction, it is necessary to develop evaluation approaches and methods that credit IR methods for their ability to retrieve highly relevant documents. This can be done by extending traditional evaluation methods, that is, recall and precision based on binary relevance judgments, to graded relevance judgments. Alternatively, novel measures based on graded relevance judgments may be developed. This article proposes several novel measures that compute the cumulative gain the user obtains by examining the retrieval result up to a given ranked position. The first one accumulates the relevance scores of retrieved documents along the ranked result list. The second one is similar but applies a discount factor to the relevance scores in order to devaluate late-retrieved documents. The third one computes the relative-to-the-ideal performance of IR techniques, based on the cumulative gain they are able to yield. These novel measures are defined and discussed and their use is demonstrated in a case study using TREC data: sample system run results for 20 queries in TREC-7. As a relevance base we used novel graded relevance judgments on a four-point scale. The test results indicate that the proposed measures credit IR methods for their ability to retrieve highly relevant documents and allow testing of statistical significance of effectiveness differences. The graphs based on the measures also provide insight into the performance IR techniques and allow interpretation, for example, from the user point of view. 3
A city offers thousands of social events a day, and it is difficult for dwellers to make choices. The combination of mobile phones and recommender systems can change the way one deals with such abundance. Mobile phones with positioning technology are now widely available, making it easy for people to broadcast their whereabouts, recommender systems can now identify patterns in people’s movements in order to, for example, recommend events. To do so, the system relies on having mobile users who share their attendance at a large number of social events: cold-start users, who have no location history, cannot receive recommendations. We set out to address the mobile cold-start problem by answering the following research question: how can social events be recommended to a cold-start user based only on his home location? To answer this question, we carry out a study of the relationship between preferences for social events and geography, the first of its kind in a large metropolitan area. We sample location estimations of one million mobile phone users in Greater Boston, combine the sample with social events in the same area, and infer the social events attended by 2,519 residents. Upon this data, we test a variety of algorithms for recommending social events. We find that the most effective algorithm recommends events that are popular among residents of an area. The least effective, instead, recommends events that are geographically close to the area. This last result has interesting implications for location-based services that emphasize recommending nearby events. 3
..
Cartography and other applications of remote sensing have led to an increased interest in the(semi-)automatic interpretation of structures in aerial images of urban and suburban areas. Although these areas are particularly challenging because of their complexity, the degree of regularity in such man-made structures also helps to tackle the problems. The paper presents the iterated application of the Hough transform as a means to exploit such regularities. It shows how such "Cascaded Hough Transform"(or CHT for short) yields straight lines, vanishing points, and vanishing lines. It also illustrates how the latter assist in improving the precision of the former. The examples are based on real aerial photographs. 1
Pfinder is a real-time system for tracking people and interpreting their behavior. It runs at 10Hz on a standard SGI Indy computer, and has performed reliably on thousands of people in many different physical locations. The system uses a multiclass statistical model of color and shape to obtain a 2D representation of head and hands in a wide range of viewing conditions. Pfinder has been successfully used in a wide range of applications including wireless interfaces, video databases, and low-bandwidth coding. 1
We present a novel statistical and variational approach to image segmentation based on a new algorithm named region competition. This algorithm is derived by minimizing a generalized Bayes/MDL criterion using the variational principle. The algorithm is guaranteed to converge to a local minimum and combines aspects of snakes/balloons and region growing. Indeed the classic snakes/balloons and region growing algorithms can be directly derived from our approach. We provide theoretical analysis of region competition including accuracy of boundary location, criteria for initial conditions, and the relationship to edge detection using filters. It is straightforward to generalize the algorithm to multiband segmentation and we demonstrate it on gray level images, color images and texture images. The novel color model allows us to eliminate intensity gradients and shadows, thereby obtaining segmentation based on the albedos of objects. It also helps detect highlight regions. 1
We present a new approach to the modeling and decomposition of Gaussian mixtures by using robust statistical methods. The mixture distribution is viewed as a contaminated Gaussian density. Using this model and the model-fitting (MF) estimator, we propose a recursive algorithm called the Gaussian mixture density decomposition (GMDD) algorithm for successively identifying each Gaussian component in the mixture. The proposed decomposition scheme has advantages that are desirable but lacking in most existing techniques. In the GMDD algorithm the number of components does not need to be specified a priori, the proportion of noisy data in the mixture can be large, the parameter estimation of each component is virtually initial independent, and the variability in the shape and size of the component densities in the mixture is taken into account. Gaussian mixture density modeling and decomposition has been widely applied in a variety of disciplines that require signal or waveform characterization for classification and recognition. We apply the proposed GMDD algorithm to the identification and extraction of clusters, and the estimation of unknown probability densities. Probability density estimation by identifying a decomposition using the GMDD algorithm, that is, a superposition of normal distributions, is successfully applied to automated cell classification. Computer experiments using both real data and simulated data demonstrate the validity and power of the GMDD algorithm for various models and different noise assumptions 1
Urban computing for city planning is one of the most significant applications in Ubiquitous computing. In this paper we detect flawed urban planning using the GPS trajectories of taxicabs traveling in urban areas. The detected results consist of 1) pairs of regions with salient traffic problems and 2) the linking structure as well as correlation among them. These results can evaluate the effectiveness of the carried out planning, such as a newly built road and subway lines in a city, and remind city planners of a problem that has not been recognized when they conceive future plans. We conduct our method using the trajectories generated by 30,000 taxis from March to May in 2009 and 2010 in Beijing, and evaluate our results with the real urban planning of Beijing. 1
Name: count, Length: 172, dtype: int64
# Reference-list frequencies; [] means no outgoing references parsed.
data['references'].value_counts(dropna=False)
references
[] 122
[https://dl.acm.org/doi/10.1145/2133806.2133826, https://dl.acm.org/doi/10.5555/944919.944937, https://dl.acm.org/doi/10.1145/2020408.2020523, https://dl.acm.org/doi/10.1145/1835804.1835918, https://dl.acm.org/doi/10.1109/34.161346, https://dl.acm.org/doi/10.1145/2020408.2020571, https://dl.acm.org/doi/10.1145/2063212.2063223, https://dl.acm.org/doi/10.1016/0377-0427(87)90125-7, https://dl.acm.org/doi/10.5555/558008, https://dl.acm.org/doi/10.1145/1999320.1999331, https://dl.acm.org/doi/10.1145/1963405.1963443, https://dl.acm.org/doi/10.1145/2020408.2020462, https://dl.acm.org/doi/10.1145/2030112.2030128, https://dl.acm.org/doi/10.1145/2030112.2030126, https://dl.acm.org/doi/10.5555/2124413] 3
[https://dl.acm.org/doi/10.1145/130385.130401, https://dl.acm.org/doi/10.5555/211359, https://dl.acm.org/doi/10.1162/089976698300017467, https://dl.acm.org/doi/10.5555/299094, https://dl.acm.org/doi/10.5555/302528.302628, https://dl.acm.org/doi/10.5555/889153, https://dl.acm.org/doi/10.1023/A:1009982220290, https://dl.acm.org/doi/10.5555/645326.649721, https://dl.acm.org/doi/10.1145/288627.288651, https://dl.acm.org/doi/10.5555/576628, https://dl.acm.org/doi/10.5555/299094.299105, https://dl.acm.org/doi/10.1016/0167-8655(94)90027-2, https://dl.acm.org/doi/10.5555/929901, https://dl.acm.org/doi/10.5555/888836, https://dl.acm.org/doi/10.5555/1098680, https://dl.acm.org/doi/10.1023/A:1009715923555, https://dl.acm.org/doi/10.5555/299094.299105, https://dl.acm.org/doi/10.5555/299094.299103, https://dl.acm.org/doi/10.5555/299094.299104] 3
[https://dl.acm.org/doi/10.5555/303568.303903, https://dl.acm.org/doi/10.5555/525960, https://dl.acm.org/doi/10.5555/646256.684894, https://dl.acm.org/doi/10.1145/130385.130401, https://dl.acm.org/doi/10.5555/3091696.3091706, https://dl.acm.org/doi/10.5555/299094.299100, https://dl.acm.org/doi/10.5555/2998981.2999003, https://dl.acm.org/doi/10.5555/39857, https://dl.acm.org/doi/10.1162/neco.1992.4.1.1, https://dl.acm.org/doi/10.1162/089976698300017269, https://dl.acm.org/doi/10.5555/5509, https://dl.acm.org/doi/10.5555/299094.299103, https://dl.acm.org/doi/10.5555/2980, https://dl.acm.org/doi/10.5555/1196925, https://dl.acm.org/doi/10.5555/646257.685538, https://dl.acm.org/doi/10.5555/794189.794466, https://dl.acm.org/doi/10.5555/148286, https://dl.acm.org/doi/10.5555/646256.684746, https://dl.acm.org/doi/10.5555/302528.302764, https://dl.acm.org/doi/10.1162/089976698300017467, https://dl.acm.org/doi/10.1109/78.650102, https://dl.acm.org/doi/10.1145/238061.238070, https://dl.acm.org/doi/10.1016/S0893-6080(98)00032-X, https://dl.acm.org/doi/10.5555/1098680, https://dl.acm.org/doi/10.5555/211359, https://dl.acm.org/doi/10.5555/211359, https://dl.acm.org/doi/10.5555/299094.299099] 2
[https://dl.acm.org/doi/10.1162/neco.1989.1.1.151, https://dl.acm.org/doi/10.5555/109230.109279, https://dl.acm.org/doi/10.1162/neco.1992.4.1.1, https://dl.acm.org/doi/10.5555/118850.118983, https://dl.acm.org/doi/10.5555/1098680] 2
...
[https://dl.acm.org/doi/10.1109/34.295913, https://dl.acm.org/doi/10.5555/30394, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767851, https://dl.acm.org/doi/10.1016/1049-9660(91)90028-N, https://dl.acm.org/doi/10.5555/92131, https://dl.acm.org/doi/10.1007/BF00115697, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767596, https://dl.acm.org/doi/10.1109/34.56204, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767505, https://dl.acm.org/doi/10.5555/59551, https://dl.acm.org/doi/10.1007/BF00137441, https://dl.acm.org/doi/10.1137/0731015, https://dl.acm.org/doi/10.1109/34.50626, https://dl.acm.org/doi/10.1007/BF01679685, https://dl.acm.org/doi/10.5555/193183, https://dl.acm.org/doi/10.1007/BF00127812, https://dl.acm.org/doi/10.1016/0005-1098(78)90005-5, https://dl.acm.org/doi/10.5555/534247, https://dl.acm.org/doi/10.1007/BF01427153, https://dl.acm.org/doi/10.1016/0031-3203(89)90010-1, https://dl.acm.org/doi/10.5555/889385, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767599, https://dl.acm.org/doi/10.5555/247372.247375, https://dl.acm.org/doi/10.5555/794190.794527] 1
[https://dl.acm.org/doi/10.1109/34.387503, https://dl.acm.org/doi/10.1109/34.334396, https://dl.acm.org/doi/10.1109/34.391395, https://dl.acm.org/doi/10.1007/s005300050046, https://dl.acm.org/doi/10.1109/34.216727, https://dl.acm.org/doi/10.1109/34.85661, https://dl.acm.org/doi/10.5555/200241.200246, https://dl.acm.org/doi/10.1109/34.531801] 1
[https://dl.acm.org/doi/10.1109/TPAMI.1986.4767747, https://dl.acm.org/doi/10.1109/TPAMI.1987.4767980, https://dl.acm.org/doi/10.5555/30394, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767851, https://dl.acm.org/doi/10.5555/59861, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767596, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767475, https://dl.acm.org/doi/10.5555/6519, https://dl.acm.org/doi/10.5555/1095712, https://dl.acm.org/doi/10.1016/S0734-189X(87)80181-0, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767841, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767852, https://dl.acm.org/doi/10.1109/2.74, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767748] 1
[https://dl.acm.org/doi/10.1109/TPAMI.1986.4767749, https://dl.acm.org/doi/10.5555/30394, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767851, https://dl.acm.org/doi/10.1109/34.6782, https://dl.acm.org/doi/10.1109/TPAMI.1984.4767596, https://dl.acm.org/doi/10.1016/S0734-189X(87)80153-6, https://dl.acm.org/doi/10.5555/1623516.1623607, https://dl.acm.org/doi/10.1109/TPAMI.1986.4767748] 1
[https://dl.acm.org/doi/10.5555/645484.656550, https://dl.acm.org/doi/10.1145/1864349.1864380, https://dl.acm.org/doi/10.1007/s00779-005-0046-3, https://dl.acm.org/doi/10.1109/CSE.2009.91, https://dl.acm.org/doi/10.1145/1835804.1835918, https://dl.acm.org/doi/10.1109/MPRV.2007.57, https://dl.acm.org/doi/10.1145/1463434.1463477, https://dl.acm.org/doi/10.1145/321556.321570, https://dl.acm.org/doi/10.1109/MC.2006.308, https://dl.acm.org/doi/10.1145/1869790.1869807, https://dl.acm.org/doi/10.1145/1409635.1409678, https://dl.acm.org/doi/10.1145/1921591.1921596] 1
Name: count, Length: 122, dtype: int64
Очистка данных¶
# Drop rows with a missing title.
data = data[data['title'].notna()]
data.reset_index(drop=True, inplace=True)
# Drop rows with an empty author collection.
# NOTE(review): `len` assumes 'authors' already holds parsed lists; if the
# column still contains their string repr (as read from CSV), even "[]" has
# len 2 and nothing is filtered — confirm against the earlier parsing cells.
data = data[data['authors'].apply(len) > 0]
data.reset_index(drop=True, inplace=True)
# Drop rows with a missing source (journal / proceedings name).
data = data[data['source'].notna()]
data.reset_index(drop=True, inplace=True)
# Drop rows with a missing publication date.
data = data[data['published'].notna()]
data.reset_index(drop=True, inplace=True)
# Drop rows without a usable abstract (missing or the ACM placeholder text).
data = data[(data['abstract'].notna()) & (data['abstract'] != 'No abstract available.')]
data.reset_index(drop=True, inplace=True)
# Remove duplicate articles; 'authors' and 'references' are excluded from the
# duplicate key because unhashable list values cannot be used by drop_duplicates.
data = data.drop_duplicates(
    subset=[
        'title',
        # 'authors',
        'source',
        'number and pages',
        'doi',
        'published',
        'citation',
        'metric',
        'abstract',
        # 'references'
    ]
)
data.reset_index(drop=True, inplace=True)
data
| url | deapth | title | authors | source | number and pages | doi | published | citation | metric | abstract | references | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 0 | Demand driven store site selection via multipl... | [Mengwen Xu, Tianyi Wang, Zhengwei Wu, Jingbo ... | SIGSPACIAL '16: Proceedings of the 24th ACM SI... | Article No.: 40, Pages 1 - 10 | https://doi.org/10.1145/2996913.2996996 | 2016-10-31 | 26.0 | 617.0 | Choosing a good location when opening a new st... | [https://dl.acm.org/doi/10.1016/S0305-0548(01)... |
| 1 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | The generalized maximal covering location problem | [Oded Berman, Dmitry Krass] | Computers and Operations Research | NaN | https://doi.org/10.1016/S0305-0548(01)00079-X | 2002-05-01 | 34.0 | 0.0 | We consider a generalization of the maximal co... | [] |
| 2 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Random Forests | [Leo Breiman] | Machine Learning | NaN | https://doi.org/10.1023/A:1010933404324 | 2001-10-01 | 9828.0 | 0.0 | Random forests are a combination of tree predi... | [https://dl.acm.org/doi/10.1162/neco.1997.9.7.... |
| 3 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Efficient algorithms for optimal location quer... | [Zitong Chen, Yubao Liu, Raymond Chi-Wing Wong... | SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... | NaN | https://doi.org/10.1145/2588555.2612172 | 2014-06-18 | 47.0 | 790.0 | In this paper, we study the optimal location q... | [https://dl.acm.org/doi/10.14778/2350229.23502... |
| 4 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Mean Shift: A Robust Approach Toward Feature S... | [Dorin Comaniciu, Peter Meer] | IEEE Transactions on Pattern Analysis and Mach... | NaN | https://doi.org/10.1109/34.1000236 | 2002-05-01 | 2062.0 | 0.0 | A general nonparametric technique is proposed ... | [https://dl.acm.org/doi/10.1007/BF00128233, ht... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Machine learning approaches for high-resolutio... | [Ranga Raju Vatsavai, Eddie Bright, Chandola V... | COM.Geo '11: Proceedings of the 2nd Internatio... | Article No.: 11, Pages 1 - 10 | https://doi.org/10.1145/1999320.1999331 | 2011-05-23 | 18.0 | 526.0 | The proliferation of several machine learning ... | [https://dl.acm.org/doi/10.5555/1191551.119179... |
| 165 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Geographical topic discovery and comparison | [Zhijun Yin, Liangliang Cao, Jiawei Han, Cheng... | WWW '11: Proceedings of the 20th international... | NaN | https://doi.org/10.1145/1963405.1963443 | 2011-03-28 | 232.0 | 1642.0 | This paper studies the problem of discovering ... | [https://dl.acm.org/doi/10.5555/944919.944937,... |
| 166 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Driving with knowledge from the physical world | [Jing Yuan, Yu Zheng, Xing Xie, Guangzhong Sun] | KDD '11: Proceedings of the 17th ACM SIGKDD in... | NaN | https://doi.org/10.1145/2020408.2020462 | 2011-08-21 | 641.0 | 2908.0 | This paper presents a Cloud-based system compu... | [https://dl.acm.org/doi/10.1016/j.eswa.2008.07... |
| 167 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Where to find my next passenger | [Jing Yuan, Yu Zheng, Liuhang Zhang, XIng Xie,... | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030128 | 2011-09-17 | 276.0 | 2024.0 | We present a recommender for taxi drivers and ... | [https://dl.acm.org/doi/10.1145/304182.304187,... |
| 168 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Urban computing with taxicabs | [Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie] | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030126 | 2011-09-17 | 413.0 | 3122.0 | Urban computing for city planning is one of th... | [https://dl.acm.org/doi/10.5555/645484.656550,... |
169 rows × 12 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 169 entries, 0 to 168 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 url 169 non-null object 1 deapth 169 non-null int64 2 title 169 non-null object 3 authors 169 non-null object 4 source 169 non-null object 5 number and pages 8 non-null object 6 doi 142 non-null object 7 published 169 non-null datetime64[ns] 8 citation 169 non-null float64 9 metric 169 non-null float64 10 abstract 169 non-null object 11 references 169 non-null object dtypes: datetime64[ns](1), float64(2), int64(1), object(8) memory usage: 16.0+ KB
Предобработка аннотаций¶
Лемматизация — приведение словоформы к лемме — её нормальной (словарной) форме. Например: существительные в форму единственного числа, именительного падежа.
Стемминг — нахождение основы слова для заданного исходного слова. Например: выделение корня слова.
nlp = spacy.load('en_core_web_sm')
def process_text(text: str) -> str:
    """Normalize a text for vectorization.

    Lowercases the text, strips all digit characters, then lemmatizes it
    with the module-level spaCy pipeline ``nlp``, dropping punctuation and
    stop-word tokens.

    Args:
        text: Raw text (e.g. an article abstract).

    Returns:
        Space-joined lemmas of the remaining tokens.
    """
    # One C-level pass removes every digit (replaces the per-digit replace loop).
    text = text.lower().translate(str.maketrans('', '', '0123456789'))
    doc = nlp(text)
    # Keep lemmas of tokens that are neither punctuation nor stop words.
    lemmas = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return ' '.join(lemmas)
data['abstract'][0]
'Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi.'
process_text(data['abstract'][0])
'choose good location open new store crucial future success business traditional method include offline manual survey analytic model base census datum unable adapt dynamic market time consume rapid increase availability big datum type mobile device online query datum offline positioning datum provide possibility develop automatic accurate data- drive prediction model business store site selection paper propose demand drive store site selection dd framework business store site selection mining search query datum baidu map dd detect spatial temporal distribution customer demand different business service query datum baidu map large online map search engine china detect gap demand supply determine candidate location cluster gap final stage solve location optimization problem predict rank number customer deploy supervised regression model predict number customer use learn rank model directly rank location evaluate framework type business real world case experiment result demonstrate effectiveness method dd core function store site selection implement core component business analytic platform potentially chain store merchant baidu nuomi'
data['process_abstract'] = data['abstract'].apply(lambda row: process_text(row))
data
| url | deapth | title | authors | source | number and pages | doi | published | citation | metric | abstract | references | process_abstract | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 0 | Demand driven store site selection via multipl... | [Mengwen Xu, Tianyi Wang, Zhengwei Wu, Jingbo ... | SIGSPACIAL '16: Proceedings of the 24th ACM SI... | Article No.: 40, Pages 1 - 10 | https://doi.org/10.1145/2996913.2996996 | 2016-10-31 | 26.0 | 617.0 | Choosing a good location when opening a new st... | [https://dl.acm.org/doi/10.1016/S0305-0548(01)... | choose good location open new store crucial fu... |
| 1 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | The generalized maximal covering location problem | [Oded Berman, Dmitry Krass] | Computers and Operations Research | NaN | https://doi.org/10.1016/S0305-0548(01)00079-X | 2002-05-01 | 34.0 | 0.0 | We consider a generalization of the maximal co... | [] | consider generalization maximal cover location... |
| 2 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Random Forests | [Leo Breiman] | Machine Learning | NaN | https://doi.org/10.1023/A:1010933404324 | 2001-10-01 | 9828.0 | 0.0 | Random forests are a combination of tree predi... | [https://dl.acm.org/doi/10.1162/neco.1997.9.7.... | random forest combination tree predictor tree ... |
| 3 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Efficient algorithms for optimal location quer... | [Zitong Chen, Yubao Liu, Raymond Chi-Wing Wong... | SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... | NaN | https://doi.org/10.1145/2588555.2612172 | 2014-06-18 | 47.0 | 790.0 | In this paper, we study the optimal location q... | [https://dl.acm.org/doi/10.14778/2350229.23502... | paper study optimal location query problem bas... |
| 4 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Mean Shift: A Robust Approach Toward Feature S... | [Dorin Comaniciu, Peter Meer] | IEEE Transactions on Pattern Analysis and Mach... | NaN | https://doi.org/10.1109/34.1000236 | 2002-05-01 | 2062.0 | 0.0 | A general nonparametric technique is proposed ... | [https://dl.acm.org/doi/10.1007/BF00128233, ht... | general nonparametric technique propose analys... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Machine learning approaches for high-resolutio... | [Ranga Raju Vatsavai, Eddie Bright, Chandola V... | COM.Geo '11: Proceedings of the 2nd Internatio... | Article No.: 11, Pages 1 - 10 | https://doi.org/10.1145/1999320.1999331 | 2011-05-23 | 18.0 | 526.0 | The proliferation of several machine learning ... | [https://dl.acm.org/doi/10.5555/1191551.119179... | proliferation machine learning approach make d... |
| 165 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Geographical topic discovery and comparison | [Zhijun Yin, Liangliang Cao, Jiawei Han, Cheng... | WWW '11: Proceedings of the 20th international... | NaN | https://doi.org/10.1145/1963405.1963443 | 2011-03-28 | 232.0 | 1642.0 | This paper studies the problem of discovering ... | [https://dl.acm.org/doi/10.5555/944919.944937,... | paper study problem discover compare geographi... |
| 166 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Driving with knowledge from the physical world | [Jing Yuan, Yu Zheng, Xing Xie, Guangzhong Sun] | KDD '11: Proceedings of the 17th ACM SIGKDD in... | NaN | https://doi.org/10.1145/2020408.2020462 | 2011-08-21 | 641.0 | 2908.0 | This paper presents a Cloud-based system compu... | [https://dl.acm.org/doi/10.1016/j.eswa.2008.07... | paper present cloud base system computing cust... |
| 167 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Where to find my next passenger | [Jing Yuan, Yu Zheng, Liuhang Zhang, XIng Xie,... | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030128 | 2011-09-17 | 276.0 | 2024.0 | We present a recommender for taxi drivers and ... | [https://dl.acm.org/doi/10.1145/304182.304187,... | present recommender taxi driver people expect ... |
| 168 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Urban computing with taxicabs | [Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie] | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030126 | 2011-09-17 | 413.0 | 3122.0 | Urban computing for city planning is one of th... | [https://dl.acm.org/doi/10.5555/645484.656550,... | urban computing city planning significant appl... |
169 rows × 13 columns
Векторизация¶
Кодирование по наличию слова в тексте (Flag):
Тексты разбиваются на слова. Далее каждому тексту сопоставляется словарь уникальных слов во всём тексте. Если какое-то слово из словаря встречается в определённом тексте, то ему проставляется 1, в противном случае 0.
Плюсы:
- Простота использования.
- Скорость генерации вектора.
Минусы:
- Не учитывает порядок слов.
- Не учитывает частотность слов.
- Не учитывает совстречаемость слов.
- Редкие слова могут быть вытеснены словами, которые очень часто встречаются в любых текстах.
Таким образом, векторы для каждого из текстов представляют собой наборы 0 и 1.
Мешок слов (Bag of Words):
В этом случае текст представляется в виде «мешка» из разных слов. Порядок этих слов игнорируется — важна только частота, с которой они встречаются. Для каждого текста создается вектор, где каждый элемент описывает количество вхождений определенного слова из словаря.
Плюсы:
- Простота использования.
- Скорость генерации вектора.
Минусы:
- Не учитывает порядок слов.
- Не учитывает совстречаемость слов.
- Редкие слова могут быть вытеснены словами, которые очень часто встречаются в любых текстах.
Таким образом, векторы для каждого из текстов представляют собой словари с указанием количества упоминаний слова в нём.
Term frequency - Inverse document frequency (TF-IDF):
Это числовой статистический показатель, который отражает важность слова для документа. Формально TF-IDF определяется так:
$$tf = \frac{\text{Частотность слова в документе}}{\text{Общее количество слов в документе}}$$
$$idf = \log{\frac{\text{Общее количество документов}}{\text{Количество документов со словом}}}$$
$$\text{tf-idf} = \text{tf} \times \text{idf}$$
Плюсы:
- Простота использования.
- Скорость генерации вектора.
- По-умолчанию определен на уровне документов.
- Учитывает относительную встречаемость слова.
- Учитывает безусловную частотность/редкость слова.
Минусы:
- Не учитывает порядок слов.
- Не учитывает совстречаемость слов.
Таким образом, показатель TF-IDF учитывает значимость слова — чем выше показатель, тем важнее слово.
Базовые подходы¶
def vectorization(texts: Union[list[str], pd.Series, pd.DataFrame], vectorizer_name: str) -> pd.DataFrame:
    """Vectorize a collection of texts with an sklearn vectorizer.

    Args:
        texts: Iterable of documents to vectorize.
        vectorizer_name: Either 'CountVectorizer' (bag of words) or
            'TfidfVectorizer' (TF-IDF weighting).

    Returns:
        A dense DataFrame: one row per document, one column per vocabulary term.

    Raises:
        ValueError: If ``vectorizer_name`` is not a supported vectorizer.
    """
    # Bug fix: the return annotation was `pd.DataFrame()` — a *call* that
    # built a throwaway empty frame at definition time — not the type itself.
    if vectorizer_name == 'CountVectorizer':
        vectorizer = CountVectorizer()
    elif vectorizer_name == 'TfidfVectorizer':
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError(f'Неизвестный векторизатор: {vectorizer_name}!')
    vectors = vectorizer.fit_transform(texts)
    # Densify the sparse matrix and attach the learned vocabulary as columns.
    vectors = pd.DataFrame(data=vectors.toarray(), columns=vectorizer.get_feature_names_out())
    return vectors
# Raw term counts per abstract (bag of words).
count_vectorization = vectorization(
    texts=data['process_abstract'],
    vectorizer_name='CountVectorizer',
)
# Binary presence/absence ("flag") encoding: clip every count above 1 down to 1.
flag_vectorization = count_vectorization.mask(count_vectorization > 1, 1)
# TF-IDF weighted vectors over the same preprocessed abstracts.
tfidf_vectorization = vectorization(
    texts=data['process_abstract'],
    vectorizer_name='TfidfVectorizer',
)
count_vectorization
| ability | able | abnormal | absence | absorb | abundance | abundant | academia | academic | accept | ... | workload | workstation | world | write | xor | year | yield | york | zero | zone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 165 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 166 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 167 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 168 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
169 rows × 2592 columns
flag_vectorization
| ability | able | abnormal | absence | absorb | abundance | abundant | academia | academic | accept | ... | workload | workstation | world | write | xor | year | yield | york | zero | zone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 165 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 166 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 167 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 168 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
169 rows × 2592 columns
tfidf_vectorization
| ability | able | abnormal | absence | absorb | abundance | abundant | academia | academic | accept | ... | workload | workstation | world | write | xor | year | yield | york | zero | zone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.041268 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.072373 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.136196 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 165 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 166 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.084571 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 167 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 168 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
169 rows × 2592 columns
count_vectorization.loc[0].sort_values(ascending=False)
store 6
business 6
datum 6
selection 4
site 4
..
forefront 0
forest 0
forgame 0
forge 0
zone 0
Name: 0, Length: 2592, dtype: int64
flag_vectorization.loc[0].sort_values(ascending=False)
positioning 1
datum 1
rank 1
determine 1
method 1
..
forefront 0
forest 0
forgame 0
forge 0
zone 0
Name: 0, Length: 2592, dtype: int64
tfidf_vectorization.loc[0].sort_values(ascending=False)
business 0.318310
store 0.311331
site 0.243536
baidu 0.237264
dd 0.237264
...
forefront 0.000000
forest 0.000000
forgame 0.000000
forge 0.000000
zone 0.000000
Name: 0, Length: 2592, dtype: float64
Нейросетевые алгоритмы¶
Эмбеддинги — самая популярная технология в NLP и не только. Эмбеддинг — это представление слова/текста/картинки и т.д. в виде вектора низкой размерности. Если объекты для которых получены эмбеддинги близки по смыслу, их векторы будут также похожи. Чтобы добиться такого результата, эмбеддинги обучают на больших массивах текстов с использованием нейронных сетей. У этого подхода много преимуществ по сравнению с другими:
Плюсы:
- Эмбеддинги способны улавливать семантические отношения между словами.
- Низкая размерность векторов.
- Возможность преобразования эмбеддингов с помощью векторных операций и получение осознанных и логичных результатов.
Минусы:
- Сложность реализации.
- Необходимость больших вычислительных мощностей.
- Сильная зависимость от качества и количества обучающих данных.
В качестве основных моделей и подходов для генерации эмбеддингов можно выделить следующие:
- Word2Vec. Векторное представление основывается на контекстной близости: слова, встречающиеся в тексте рядом с одинаковыми словами (а следовательно, имеющие схожий смысл), будут иметь близкие (по косинусному расстоянию) векторы.
- GloVe (Global Vectors for Word Representation). Этот метод основан на матрице совместной встречаемости слов в корпусе текстов. GloVe пытается объединить преимущества двух подходов к моделированию слов: матричной факторизации и предсказательных моделей вроде word2vec, стремясь к получению более точных векторных представлений.
- fastText. Разработанный в Facebook Research, fastText улучшает word2vec за счет обработки целых слов и подсловных единиц (например, n-грамм). Это позволяет fastText генерировать вектора для слов, отсутствующих в обучающем наборе, что является значительным преимуществом для языков с богатой морфологией.
- ELMo (Embeddings from Language Models). ELMo использует модели на основе двунаправленных LSTM (Long Short-Term Memory) сетей для генерации контекстно-зависимых векторных представлений слов. Эти представления богаты на семантическую и синтаксическую информацию, благодаря чему модель более эффективно работает с полисемией и другими языковыми нюансами.
- BERT (Bidirectional Encoder Representations from Transformers). BERT представляет собой революцию в NLP благодаря своей способности обрабатывать слова в контексте всего предложения с обеих сторон (слева направо и справа налево одновременно). Это достигается за счет использования архитектуры Transformer, что позволяет модели лучше понимать контекст и нюансы языка.
- GPT (Generative Pre-trained Transformer). Серия моделей GPT начинается с предобученного на большом корпусе текстов Transformer, который затем может быть дообучен на конкретной задаче NLP. Благодаря мощности и гибкости, модели GPT показывают выдающиеся результаты во многих задачах, включая генерацию текста, перевод, ответы на вопросы и многие другие.
Подробнее остановимся на самом первом алгоритме для генерации эмбеддингов - Word2Vec.
Word2Vec¶
Word2Vec — это популярная модель обучения вложений слов, предложенная исследователями компании Google в 2013 году. Она позволяет преобразовать слова из корпуса текстов в векторы чисел таким образом, что слова с похожими семантическими значениями имеют близкие векторные представления в многомерном пространстве.
Уже обученная модель на новостях от Google c длиной эмбеддингов 300.
gensim_word2vec = downloader.load('word2vec-google-news-300')
len(gensim_word2vec.index_to_key)
3000000
gensim_word2vec.word_vec('king')
array([ 1.25976562e-01, 2.97851562e-02, 8.60595703e-03, 1.39648438e-01,
-2.56347656e-02, -3.61328125e-02, 1.11816406e-01, -1.98242188e-01,
5.12695312e-02, 3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
-1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
3.46679688e-02, 5.21850586e-03, 4.63867188e-02, 1.28906250e-01,
1.36718750e-01, 1.12792969e-01, 5.95703125e-02, 1.36718750e-01,
1.01074219e-01, -1.76757812e-01, -2.51953125e-01, 5.98144531e-02,
3.41796875e-01, -3.11279297e-02, 1.04492188e-01, 6.17675781e-02,
1.24511719e-01, 4.00390625e-01, -3.22265625e-01, 8.39843750e-02,
3.90625000e-02, 5.85937500e-03, 7.03125000e-02, 1.72851562e-01,
1.38671875e-01, -2.31445312e-01, 2.83203125e-01, 1.42578125e-01,
3.41796875e-01, -2.39257812e-02, -1.09863281e-01, 3.32031250e-02,
-5.46875000e-02, 1.53198242e-02, -1.62109375e-01, 1.58203125e-01,
-2.59765625e-01, 2.01416016e-02, -1.63085938e-01, 1.35803223e-03,
-1.44531250e-01, -5.68847656e-02, 4.29687500e-02, -2.46582031e-02,
1.85546875e-01, 4.47265625e-01, 9.58251953e-03, 1.31835938e-01,
9.86328125e-02, -1.85546875e-01, -1.00097656e-01, -1.33789062e-01,
-1.25000000e-01, 2.83203125e-01, 1.23046875e-01, 5.32226562e-02,
-1.77734375e-01, 8.59375000e-02, -2.18505859e-02, 2.05078125e-02,
-1.39648438e-01, 2.51464844e-02, 1.38671875e-01, -1.05468750e-01,
1.38671875e-01, 8.88671875e-02, -7.51953125e-02, -2.13623047e-02,
1.72851562e-01, 4.63867188e-02, -2.65625000e-01, 8.91113281e-03,
1.49414062e-01, 3.78417969e-02, 2.38281250e-01, -1.24511719e-01,
-2.17773438e-01, -1.81640625e-01, 2.97851562e-02, 5.71289062e-02,
-2.89306641e-02, 1.24511719e-02, 9.66796875e-02, -2.31445312e-01,
5.81054688e-02, 6.68945312e-02, 7.08007812e-02, -3.08593750e-01,
-2.14843750e-01, 1.45507812e-01, -4.27734375e-01, -9.39941406e-03,
1.54296875e-01, -7.66601562e-02, 2.89062500e-01, 2.77343750e-01,
-4.86373901e-04, -1.36718750e-01, 3.24218750e-01, -2.46093750e-01,
-3.03649902e-03, -2.11914062e-01, 1.25000000e-01, 2.69531250e-01,
2.04101562e-01, 8.25195312e-02, -2.01171875e-01, -1.60156250e-01,
-3.78417969e-02, -1.20117188e-01, 1.15234375e-01, -4.10156250e-02,
-3.95507812e-02, -8.98437500e-02, 6.34765625e-03, 2.03125000e-01,
1.86523438e-01, 2.73437500e-01, 6.29882812e-02, 1.41601562e-01,
-9.81445312e-02, 1.38671875e-01, 1.82617188e-01, 1.73828125e-01,
1.73828125e-01, -2.37304688e-01, 1.78710938e-01, 6.34765625e-02,
2.36328125e-01, -2.08984375e-01, 8.74023438e-02, -1.66015625e-01,
-7.91015625e-02, 2.43164062e-01, -8.88671875e-02, 1.26953125e-01,
-2.16796875e-01, -1.73828125e-01, -3.59375000e-01, -8.25195312e-02,
-6.49414062e-02, 5.07812500e-02, 1.35742188e-01, -7.47070312e-02,
-1.64062500e-01, 1.15356445e-02, 4.45312500e-01, -2.15820312e-01,
-1.11328125e-01, -1.92382812e-01, 1.70898438e-01, -1.25000000e-01,
2.65502930e-03, 1.92382812e-01, -1.74804688e-01, 1.39648438e-01,
2.92968750e-01, 1.13281250e-01, 5.95703125e-02, -6.39648438e-02,
9.96093750e-02, -2.72216797e-02, 1.96533203e-02, 4.27246094e-02,
-2.46093750e-01, 6.39648438e-02, -2.25585938e-01, -1.68945312e-01,
2.89916992e-03, 8.20312500e-02, 3.41796875e-01, 4.32128906e-02,
1.32812500e-01, 1.42578125e-01, 7.61718750e-02, 5.98144531e-02,
-1.19140625e-01, 2.74658203e-03, -6.29882812e-02, -2.72216797e-02,
-4.82177734e-03, -8.20312500e-02, -2.49023438e-02, -4.00390625e-01,
-1.06933594e-01, 4.24804688e-02, 7.76367188e-02, -1.16699219e-01,
7.37304688e-02, -9.22851562e-02, 1.07910156e-01, 1.58203125e-01,
4.24804688e-02, 1.26953125e-01, 3.61328125e-02, 2.67578125e-01,
-1.01074219e-01, -3.02734375e-01, -5.76171875e-02, 5.05371094e-02,
5.26428223e-04, -2.07031250e-01, -1.38671875e-01, -8.97216797e-03,
-2.78320312e-02, -1.41601562e-01, 2.07031250e-01, -1.58203125e-01,
1.27929688e-01, 1.49414062e-01, -2.24609375e-02, -8.44726562e-02,
1.22558594e-01, 2.15820312e-01, -2.13867188e-01, -3.12500000e-01,
-3.73046875e-01, 4.08935547e-03, 1.07421875e-01, 1.06933594e-01,
7.32421875e-02, 8.97216797e-03, -3.88183594e-02, -1.29882812e-01,
1.49414062e-01, -2.14843750e-01, -1.83868408e-03, 9.91210938e-02,
1.57226562e-01, -1.14257812e-01, -2.05078125e-01, 9.91210938e-02,
3.69140625e-01, -1.97265625e-01, 3.54003906e-02, 1.09375000e-01,
1.31835938e-01, 1.66992188e-01, 2.35351562e-01, 1.04980469e-01,
-4.96093750e-01, -1.64062500e-01, -1.56250000e-01, -5.22460938e-02,
1.03027344e-01, 2.43164062e-01, -1.88476562e-01, 5.07812500e-02,
-9.37500000e-02, -6.68945312e-02, 2.27050781e-02, 7.61718750e-02,
2.89062500e-01, 3.10546875e-01, -5.37109375e-02, 2.28515625e-01,
2.51464844e-02, 6.78710938e-02, -1.21093750e-01, -2.15820312e-01,
-2.73437500e-01, -3.07617188e-02, -3.37890625e-01, 1.53320312e-01,
2.33398438e-01, -2.08007812e-01, 3.73046875e-01, 8.20312500e-02,
2.51953125e-01, -7.61718750e-02, -4.66308594e-02, -2.23388672e-02,
2.99072266e-02, -5.93261719e-02, -4.66918945e-03, -2.44140625e-01,
-2.09960938e-01, -2.87109375e-01, -4.54101562e-02, -1.77734375e-01,
-2.79296875e-01, -8.59375000e-02, 9.13085938e-02, 2.51953125e-01],
dtype=float32)
gensim_word2vec.word_vec('king').shape
(300,)
Визуализация эмбеддингов для 3 слов: king, man, woman. На рисунках ниже видно, что линии из квадратов для мужчины и женщины сильнее похожи друг на друга, нежели чем мужчина на короля, что демонстрирует смысловую разницу этих определений.
# Heatmap comparison of the pretrained embeddings for 'king', 'man', 'woman':
# each of the 6 subplot rows shows the next slice of 50 of the 300 components,
# so visually similar stripes indicate semantically close words.
fig, axes = plt.subplots(figsize=(15, 7.5), nrows=6)
step = 50
for idx, ax in enumerate(axes):
    sns.heatmap(
        data=np.array([
            gensim_word2vec.word_vec('king')[idx*step:(idx+1)*step],
            gensim_word2vec.word_vec('man')[idx*step:(idx+1)*step],
            gensim_word2vec.word_vec('woman')[idx*step:(idx+1)*step],
        ]),
        linewidths=0.1,
        xticklabels=[],
        yticklabels=['king', 'man', 'woman'],
        cmap='coolwarm',
        cbar=False,
        square=True,
        ax=ax,
    );
# Stack the three embeddings as DataFrame columns so their pairwise
# Pearson correlation can be computed directly with DataFrame.corr().
local = pd.DataFrame(
    data={
        'king': gensim_word2vec.word_vec('king'),
        'man': gensim_word2vec.word_vec('man'),
        'woman': gensim_word2vec.word_vec('woman'),
    },
)
local.corr()
| king | man | woman | |
|---|---|---|---|
| king | 1.000000 | 0.231538 | 0.129787 |
| man | 0.231538 | 1.000000 | 0.765997 |
| woman | 0.129787 | 0.765997 | 1.000000 |
gensim_word2vec.most_similar('king')
[('kings', 0.7138045430183411),
('queen', 0.6510956883430481),
('monarch', 0.6413194537162781),
('crown_prince', 0.6204220056533813),
('prince', 0.6159993410110474),
('sultan', 0.5864824056625366),
('ruler', 0.5797567367553711),
('princes', 0.5646552443504333),
('Prince_Paras', 0.5432944297790527),
('throne', 0.5422105193138123)]
# Cosine similarity between the 'king' and 'queen' embeddings.
gensim_word2vec.similarity('king', 'queen')
0.6510957
# 'king' is noticeably less similar to 'man' than to 'queen'.
gensim_word2vec.similarity('king', 'man')
0.22942673
# Classic word-analogy query: vector('king') - vector('man') + vector('woman');
# the top match is 'queen'.
gensim_word2vec.most_similar(
    positive=['king', 'woman'],
    negative=['man']
)
[('queen', 0.7118193507194519),
('monarch', 0.6189674139022827),
('princess', 0.5902431011199951),
('crown_prince', 0.5499460697174072),
('prince', 0.5377321839332581),
('kings', 0.5236844420433044),
('Queen_Consort', 0.5235945582389832),
('queens', 0.5181134343147278),
('sultan', 0.5098593831062317),
('monarchy', 0.5087411999702454)]
Визуализация отношений между странами и их столицами в формате эмбеддингов пропущенных через метод главных компонент (PCA) для сокращения размерности и возможности отображения на двумерной плоскости.
# Country–capital relatedness: 'russia' and 'moscow' are close.
gensim_word2vec.similarity('russia', 'moscow')
0.5842015
# A mismatched country–capital pair scores lower.
gensim_word2vec.similarity('usa', 'moscow')
0.4821158
# Analogy over country–capital pairs: russia - moscow + tokyo ≈ japan.
gensim_word2vec.most_similar(
    positive=['russia', 'tokyo'],
    negative=['moscow']
)
[('japan', 0.5613622665405273),
('asia', 0.5138392448425293),
('korea', 0.5004267692565918),
('washington', 0.4972049295902252),
('murdoch', 0.49687281250953674),
('korean', 0.4891359508037567),
('north_korea', 0.48261064291000366),
('japanese', 0.48163819313049316),
('obj', 0.4747077524662018),
('south_korea', 0.47251230478286743)]
Векторизуем наши аннотации статей с помощью Word2Vec. Основной проблемой здесь является то, что эта модель может преобразовывать только слова в векторы, а нам бы хотелось сделать аналогичную операцию, но с текстами. Для этого воспользуемся некоторым допущением и будем считать, что конкретный текст может быть представлен усредненным набором эмбеддингов слов, из которых он состоит.
# Build per-text features from Word2Vec: for every vocabulary column, the
# presence flag (1) is replaced with the mean of that word's embedding
# components; out-of-vocabulary columns are zeroed.
# NOTE(review): this collapses each 300-dim embedding to a single scalar
# rather than averaging whole word vectors per document — confirm intended.
word2vec_vectorization = flag_vectorization.copy()
# key_to_index gives O(1) vocabulary membership tests; scanning the
# index_to_key list would be O(vocab size) for every one of ~2.6k columns.
model_vocab = gensim_word2vec.key_to_index
for column in word2vec_vectorization:
    if column not in model_vocab:
        word2vec_vectorization[column] = 0
        continue
    # get_vector() is the non-deprecated spelling of word_vec() in gensim 4.
    embedding = gensim_word2vec.get_vector(column)
    mean_component = np.mean(embedding)
    word2vec_vectorization[column] = word2vec_vectorization[column].mask(
        word2vec_vectorization[column] == 1, mean_component
    )
word2vec_vectorization
| ability | able | abnormal | absence | absorb | abundance | abundant | academia | academic | accept | ... | workload | workstation | world | write | xor | year | yield | york | zero | zone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | -0.013312 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | -0.000551 | 0.0 | 0.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.005286 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 165 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 166 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | -0.013312 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 167 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
| 168 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.000000 | 0.0 | 0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 |
169 rows × 2592 columns
Поиск похожих статей¶
Интересный факт:
Вспомним определение корреляции Пирсона.
$\rho = \frac{\text{cov}(x, y)}{\sigma (x) \sigma (y)} = \frac{\sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_{i=1}^{N} (x_i - \bar{x})^2 \sum_{i=1}^{N} (y_i - \bar{y})^2}}$
Вспомним составляющие его части.
- $\text{cov}(x, y) = \frac{\sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y})}{N - 1}$
- $\sigma (x) = \sqrt{\frac{\sum_{i=1}^{N} (x_i - \bar{x})^2}{N - 1}}$
Что-то отдаленно напоминает...
Посмотрим поближе на скалярное произведение векторов.
$\langle x, y \rangle = \sum_{i=1}^{N} x_i y_i = \| x \|_{2} \| y \|_{2} \cos \alpha \to \cos \alpha = \frac{\sum_{i=1}^{N} x_i y_i}{\| x \|_{2} \| y \|_{2}}$
Начинает вырисовываться некоторое сходство. Наконец, вспомним определение нормы в евклидовом пространстве.
$\| x \|_{2} = \sqrt{\sum_{i=1}^{N} x_i^2}$
Таким образом имеем следующее: $\cos \alpha = \frac{\sum_{i=1}^{N} x_i y_i}{\sqrt{\sum_{i=1}^{N} x_i^2 \sum_{i=1}^{N} y_i^2}}$.
Получается, что корреляция Пирсона — это не что иное, как косинус угла между децентрированными векторами. Более того, в случае, когда векторы не просто децентрированы, но и нормированы, все три метрики — скалярное произведение, косинус угла, корреляция Пирсона — будут эквивалентны.
def get_top_words(vectors: pd.DataFrame, top: int) -> None:
    """Print the `top` most frequent words and their total counts.

    Word frequencies are obtained by summing each column of `vectors`
    (one column per word, one row per document) and sorting descending.
    """
    totals = vectors.sum().sort_values(ascending=False)
    for position, (word, count) in enumerate(totals.items()):
        if position == top:
            break
        print(f'{count} \t- {word}')
Ручной вариант по ключевым словам и косинусному сходству¶
Конкретно в этом случае удобно использовать векторизацию на основе бинаризации наличия слова в тексте (Flag), то есть нам не важно, какое количество раз слово встретилось в тексте, а важен сам факт его наличия. Именно поэтому все частоты слов со значением больше 1 приравниваются к 1, но делается это только на стадии поиска похожих статей. Частоты важны на моменте определения набора ключевых слов, по которым в дальнейшем будет осуществляться поиск.
def get_keyword_recommendations(articles: pd.DataFrame, vectors: pd.DataFrame, keywords: list) -> pd.DataFrame:
    """Score articles by cosine similarity to a one-hot keyword query.

    The original annotation was ``-> pd.DataFrame()``, which *calls* the
    constructor at definition time; the class itself is the correct annotation.

    Parameters
    ----------
    articles : pd.DataFrame
        Article metadata, aligned with `vectors` by position. Mutated in
        place with a 'cosine_similarity' column (callers pass a copy).
    vectors : pd.DataFrame
        Word-count vectors, one row per article, one column per word.
    keywords : list
        Words to set to 1 in the query vector; assumed to be present in
        `vectors.columns`.

    Returns
    -------
    pd.DataFrame
        `articles` with the added 'cosine_similarity' column.
    """
    # One-hot query vector over the full vocabulary.
    initial_vector = pd.DataFrame(index=[0], columns=vectors.columns)
    initial_vector = initial_vector.fillna(0)
    initial_vector.loc[0, keywords] = 1
    # Binarize counts: only word presence matters for this search.
    vectors = vectors.mask(vectors > 1, 1)
    articles['cosine_similarity'] = cosine_similarity(vectors, initial_vector)
    return articles
# Inspect the 50 most frequent words to pick search keywords manually.
get_top_words(count_vectorization, 50)
254 - location 180 - base 178 - user 171 - algorithm 171 - datum 149 - model 132 - query 132 - method 129 - problem 104 - propose 103 - result 102 - paper 98 - approach 84 - image 79 - system 75 - provide 74 - network 73 - new 69 - set 67 - application 67 - recommendation 66 - technique 65 - information 65 - real 64 - probabilistic 62 - social 59 - find 58 - rank 58 - present 57 - study 55 - service 54 - performance 52 - database 50 - large 49 - region 48 - number 47 - use 45 - point 44 - time 43 - spatial 41 - analysis 41 - scale 39 - function 39 - mobile 39 - dataset 38 - experiment 38 - optimal 38 - give 38 - search 37 - measure
# Manually selected query keywords describing the store-site-selection
# topic (informed by the frequency list printed above). Note the lemmatized
# vocabulary uses 'datum'; 'data' is kept as well for safety.
keywords = [
    'location',
    'datum',
    'model',
    'base',
    'problem',
    'optimal',
    'spatial',
    'area',
    'search',
    'site',
    'data',
    'place',
    'map',
    'business',
    'store',
    'city',
    'selection',
]
# Rank all articles against the manual keyword query and keep only the
# columns useful for eyeballing the recommendations.
result = get_keyword_recommendations(
    articles=data.copy(),
    vectors=flag_vectorization.copy(),
    keywords=keywords,
)
display_columns = [
    'title',
    'abstract',
    'published',
    'citation',
    'metric',
    'cosine_similarity',
]
result = result.sort_values(by='cosine_similarity', ascending=False)[display_columns]
result
| title | abstract | published | citation | metric | cosine_similarity | |
|---|---|---|---|---|---|---|
| 9 | Geo-spotting: mining online location-based ser... | The problem of identifying the optimal locatio... | 2013-08-11 | 193.0 | 2151.0 | 0.318397 |
| 0 | Demand driven store site selection via multipl... | Choosing a good location when opening a new st... | 2016-10-31 | 26.0 | 617.0 | 0.312190 |
| 21 | A scalable algorithm for maximizing range sum ... | This paper investigates the MaxRS problem in s... | 2012-07-01 | 51.0 | 310.0 | 0.196039 |
| 30 | Progressive computation of the min-dist optima... | This paper proposes and solves the min-dist op... | 2006-09-01 | 36.0 | 312.0 | 0.194745 |
| 22 | The optimal-location query | We propose and solve the optimal-location quer... | 2005-08-22 | 38.0 | 0.0 | 0.194745 |
| ... | ... | ... | ... | ... | ... | ... |
| 39 | Adaptive Nonlocal Filtering: A Fast Alternativ... | Nonlinear anisotropic diffusion algorithms pro... | 1999-01-01 | 5.0 | 0.0 | 0.000000 |
| 164 | Machine learning approaches for high-resolutio... | The proliferation of several machine learning ... | 2011-05-23 | 18.0 | 526.0 | 0.000000 |
| 51 | Cluster-based probability model and its applic... | We develop, analyze, and apply a specific form... | 1997-02-01 | 23.0 | 0.0 | 0.000000 |
| 50 | Scale-Space and Edge Detection Using Anisotrop... | A new definition of scale-space is suggested, ... | 1990-07-01 | 1696.0 | 0.0 | 0.000000 |
| 77 | An Evaluation of Statistical Approaches to Tex... | This paper focuses on a comparative evaluation... | 1999-04-01 | 495.0 | 0.0 | 0.000000 |
169 rows × 6 columns
# Print the titles and abstracts of the five best keyword matches.
divider = '-' * 150
for _, article in result.head(5).iterrows():
    print(divider)
    print(f"Title: {article['title']}")
    print()
    print(f"Abstract: {article['abstract']}")
    print(divider)
------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Geo-spotting: mining online location-based services for optimal retail store placement Abstract: The problem of identifying the optimal location for a new retail store has been the focus of past research, especially in the field of land economy, due to its importance in the success of a business. Traditional approaches to the problem have factored in demographics, revenue and aggregated human flow statistics from nearby or remote areas. However, the acquisition of relevant data is usually expensive. With the growth of location-based social networks, fine grained data describing user mobility and popularity of places has recently become attainable. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Demand driven store site selection via multiple spatial-temporal data Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. 
DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: A scalable algorithm for maximizing range sum in spatial databases Abstract: This paper investigates the MaxRS problem in spatial databases. Given a set O of weighted points and a rectangular region r of a given size, the goal of the MaxRS problem is to find a location of r such that the sum of the weights of all the points covered by r is maximized. This problem is useful in many location-based applications such as finding the best place for a new franchise store with a limited delivery range and finding the most attractive place for a tourist with a limited reachable range. However, the problem has been studied mainly in theory, particularly, in computational geometry. 
The existing algorithms from the computational geometry community are in-memory algorithms which do not guarantee the scalability. In this paper, we propose a scalable external-memory algorithm (ExactMaxRS) for the MaxRS problem, which is optimal in terms of the I/O complexity. Furthermore, we propose an approximation algorithm (ApproxMaxCRS) for the MaxCRS problem that is a circle version of the MaxRS problem. We prove the correctness and optimality of the ExactMaxRS algorithm along with the approximation bound of the ApproxMaxCRS algorithm. From extensive experimental results, we show that the ExactMaxRS algorithm is two orders of magnitude faster than methods adapted from existing algorithms, and the approximation bound in practice is much better than the theoretical bound of the ApproxMaxCRS algorithm. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Progressive computation of the min-dist optimal-location query Abstract: This paper proposes and solves the min-dist optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the min-dist optimal-location query returns a location in Q which, if a new site is built there, minimizes the average distance from each object to its closest site. This query can help a franchise (e.g. McDonald's) decide where to put a new store in order to maximize the benefit to its customers. To solve this problem is challenging, for there are theoretically infinite number of locations in Q, all of which could be candidates. This paper first provides a theorem that limits the number of candidate locations without losing the power to find exact answers. 
Then it provides a progressive algorithm that quickly suggests a location, tells the maximum error it may have, and keeps refining the result. When the algorithm finishes, the exact answer can be found. The intermediate result of early runs can be used to prune the search space for later runs. Crucial to the pruning technique are novel lower-bound estimators. The proposed algorithm, the effect of several optimizations, and the progressiveness are experimentally evaluated. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: The optimal-location query Abstract: We propose and solve the optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the optimal-location query returns a location in Q with maximum influence. Here the influence of a location l is the total weight of its RNNs, i.e. the total weight of objects in O that are closer to l than to any site in S. This new query has practical applications, but is very challenging to solve. Existing work on computing RNNs assumes a single query location, and thus cannot be used to compute optimal locations. The reason is that there are infinite candidate locations in Q. If we check a finite set of candidate locations, the result can be inaccurate, i.e. the revealed location may not have maximum influence. This paper proposes three methods that accurately compute optimal locations. The first method uses a standard R*-tree. To compute an optimal location, the method retrieves certain objects from the R*-tree and sends them as a stream to a plane-sweep algorithm, which uses a new data structure called the aSB-tree to ensure query efficiency. 
The second method is based on a new index structure called the OL-tree, which novelly extends the k-d-B-tree to store segmented rectangular records. The OL-tree is only of theoretical usage for it is not space efficient. The most practical approach is based on a new index structure called the Virtual OL-tree. These methods are theoretically and experimentally evaluated. ------------------------------------------------------------------------------------------------------------------------------------------------------
Автоматизированный вариант по аннотациям и косинусному сходству¶
def get_abstract_recommendations(articles: pd.DataFrame, vectors: pd.DataFrame, initial_vector: pd.DataFrame) -> pd.DataFrame:
    """Rank articles by cosine similarity of their abstract vectors to a query.

    The original annotation was ``-> pd.DataFrame()``, which *calls* the
    constructor at definition time; the class itself is the correct annotation.

    Parameters
    ----------
    articles : pd.DataFrame
        Article metadata, aligned with `vectors` by position. Mutated in
        place with a 'cosine_similarity' column (callers pass a copy).
    vectors : pd.DataFrame
        Vectorized abstracts, one row per article.
    initial_vector : pd.DataFrame
        Single-row query vector in the same feature space as `vectors`.

    Returns
    -------
    pd.DataFrame
        Articles sorted by descending similarity, restricted to the
        display columns.
    """
    articles['cosine_similarity'] = cosine_similarity(vectors, initial_vector)
    articles = articles.sort_values(by='cosine_similarity', ascending=False)
    articles = articles[[
        'title',
        'abstract',
        'published',
        'citation',
        'metric',
        'cosine_similarity'
    ]]
    return articles
# Run the abstract-based recommender once per vectorization scheme, using
# article 0 ("Demand driven store site selection ...") as the query.
_vector_sets = {
    'flag': flag_vectorization,
    'count': count_vectorization,
    'tfidf': tfidf_vectorization,
    'word2vec': word2vec_vectorization,
}
_results = {
    name: get_abstract_recommendations(
        articles=data.copy(),
        vectors=vectors.copy(),
        initial_vector=vectors.loc[[0], :],
    )
    for name, vectors in _vector_sets.items()
}
result_flag_vectorization = _results['flag']
result_count_vectorization = _results['count']
result_tfidf_vectorization = _results['tfidf']
result_word2vec_vectorization = _results['word2vec']
# Rankings produced with binary (flag) vectors.
result_flag_vectorization
| title | abstract | published | citation | metric | cosine_similarity | |
|---|---|---|---|---|---|---|
| 0 | Demand driven store site selection via multipl... | Choosing a good location when opening a new st... | 2016-10-31 | 26.0 | 617.0 | 1.000000 |
| 79 | Location-based and preference-aware recommenda... | The popularity of location-based social networ... | 2012-11-06 | 503.0 | 3713.0 | 0.248784 |
| 65 | Semi-supervised document retrieval | This paper proposes a new machine learning met... | 2009-05-01 | 21.0 | 0.0 | 0.241932 |
| 6 | Exploiting geographic dependencies for real es... | It is traditionally a challenge for home buyer... | 2014-08-24 | 81.0 | 1076.0 | 0.237429 |
| 31 | MaxFirst for MaxBRkNN | The MaxBRNN problem finds a region such that s... | 2011-04-11 | 26.0 | 0.0 | 0.229835 |
| ... | ... | ... | ... | ... | ... | ... |
| 152 | Improved use of continuous attributes in C4.5 | A reported weakness of C4.5 in domains with co... | 1996-03-01 | 260.0 | 0.0 | 0.044281 |
| 164 | Machine learning approaches for high-resolutio... | The proliferation of several machine learning ... | 2011-05-23 | 18.0 | 526.0 | 0.042875 |
| 12 | Scikit-learn: Machine Learning in Python | Scikit-learn is a Python module integrating a ... | 2011-11-01 | 7648.0 | 19849.0 | 0.042008 |
| 55 | Bilateral Filtering for Gray and Color Images | Bilateral filtering smooths images while prese... | 1998-01-04 | 793.0 | 0.0 | 0.027730 |
| 95 | Comparing Top k Lists | Motivated by several applications, we introduc... | 2004-01-01 | 218.0 | 0.0 | 0.000000 |
169 rows × 6 columns
# Rankings produced with raw count vectors.
result_count_vectorization
| title | abstract | published | citation | metric | cosine_similarity | |
|---|---|---|---|---|---|---|
| 0 | Demand driven store site selection via multipl... | Choosing a good location when opening a new st... | 2016-10-31 | 26.0 | 617.0 | 1.000000 |
| 159 | A taxi business intelligence system | The increasing availability of large-scale loc... | 2011-08-21 | 54.0 | 1288.0 | 0.348732 |
| 7 | Optimal network location queries | Given a set S of sites and a set O of weighted... | 2010-11-02 | 21.0 | 199.0 | 0.313756 |
| 30 | Progressive computation of the min-dist optima... | This paper proposes and solves the min-dist op... | 2006-09-01 | 36.0 | 312.0 | 0.305847 |
| 65 | Semi-supervised document retrieval | This paper proposes a new machine learning met... | 2009-05-01 | 21.0 | 0.0 | 0.296108 |
| ... | ... | ... | ... | ... | ... | ... |
| 51 | Cluster-based probability model and its applic... | We develop, analyze, and apply a specific form... | 1997-02-01 | 23.0 | 0.0 | 0.020885 |
| 152 | Improved use of continuous attributes in C4.5 | A reported weakness of C4.5 in domains with co... | 1996-03-01 | 260.0 | 0.0 | 0.019557 |
| 164 | Machine learning approaches for high-resolutio... | The proliferation of several machine learning ... | 2011-05-23 | 18.0 | 526.0 | 0.017969 |
| 55 | Bilateral Filtering for Gray and Color Images | Bilateral filtering smooths images while prese... | 1998-01-04 | 793.0 | 0.0 | 0.012093 |
| 95 | Comparing Top k Lists | Motivated by several applications, we introduc... | 2004-01-01 | 218.0 | 0.0 | 0.000000 |
169 rows × 6 columns
# Rankings produced with TF-IDF vectors.
result_tfidf_vectorization
| title | abstract | published | citation | metric | cosine_similarity | |
|---|---|---|---|---|---|---|
| 0 | Demand driven store site selection via multipl... | Choosing a good location when opening a new st... | 2016-10-31 | 26.0 | 617.0 | 1.000000 |
| 159 | A taxi business intelligence system | The increasing availability of large-scale loc... | 2011-08-21 | 54.0 | 1288.0 | 0.251345 |
| 7 | Optimal network location queries | Given a set S of sites and a set O of weighted... | 2010-11-02 | 21.0 | 199.0 | 0.250829 |
| 31 | MaxFirst for MaxBRkNN | The MaxBRNN problem finds a region such that s... | 2011-04-11 | 26.0 | 0.0 | 0.221271 |
| 30 | Progressive computation of the min-dist optima... | This paper proposes and solves the min-dist op... | 2006-09-01 | 36.0 | 312.0 | 0.217079 |
| ... | ... | ... | ... | ... | ... | ... |
| 40 | The estimation of the gradient of a density fu... | Nonparametric density gradient estimation usin... | 2006-09-01 | 405.0 | 0.0 | 0.011558 |
| 155 | Getting from here to there: interactive planni... | Planning and monitoring a trip is a common but... | 2002-07-28 | 7.0 | 0.0 | 0.009749 |
| 152 | Improved use of continuous attributes in C4.5 | A reported weakness of C4.5 in domains with co... | 1996-03-01 | 260.0 | 0.0 | 0.006445 |
| 55 | Bilateral Filtering for Gray and Color Images | Bilateral filtering smooths images while prese... | 1998-01-04 | 793.0 | 0.0 | 0.003005 |
| 95 | Comparing Top k Lists | Motivated by several applications, we introduc... | 2004-01-01 | 218.0 | 0.0 | 0.000000 |
169 rows × 6 columns
# Rankings produced with the Word2Vec-derived vectors.
result_word2vec_vectorization
| title | abstract | published | citation | metric | cosine_similarity | |
|---|---|---|---|---|---|---|
| 0 | Demand driven store site selection via multipl... | Choosing a good location when opening a new st... | 2016-10-31 | 26.0 | 617.0 | 1.000000 |
| 135 | Mining significant semantic locations from GPS... | With the increasing deployment and use of GPS-... | 2010-09-01 | 254.0 | 1734.0 | 0.261367 |
| 144 | Location-based recommendation system using Bay... | As wireless communication advances, research o... | 2007-07-11 | 75.0 | 0.0 | 0.251195 |
| 7 | Optimal network location queries | Given a set S of sites and a set O of weighted... | 2010-11-02 | 21.0 | 199.0 | 0.241299 |
| 6 | Exploiting geographic dependencies for real es... | It is traditionally a challenge for home buyer... | 2014-08-24 | 81.0 | 1076.0 | 0.239075 |
| ... | ... | ... | ... | ... | ... | ... |
| 55 | Bilateral Filtering for Gray and Color Images | Bilateral filtering smooths images while prese... | 1998-01-04 | 793.0 | 0.0 | 0.016474 |
| 164 | Machine learning approaches for high-resolutio... | The proliferation of several machine learning ... | 2011-05-23 | 18.0 | 526.0 | 0.013971 |
| 97 | Provenance semirings | We show that relational algebra calculations f... | 2007-06-11 | 511.0 | 2427.0 | 0.011599 |
| 152 | Improved use of continuous attributes in C4.5 | A reported weakness of C4.5 in domains with co... | 1996-03-01 | 260.0 | 0.0 | 0.010914 |
| 95 | Comparing Top k Lists | Motivated by several applications, we introduc... | 2004-01-01 | 218.0 | 0.0 | 0.000000 |
169 rows × 6 columns
# Print the titles and abstracts of the five best flag-vector matches.
divider = '-' * 150
for _, article in result_flag_vectorization.head(5).iterrows():
    print(divider)
    print(f"Title: {article['title']}")
    print()
    print(f"Abstract: {article['abstract']}")
    print(divider)
------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Demand driven store site selection via multiple spatial-temporal data Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Location-based and preference-aware recommendation using sparse geo-social networking data Abstract: The popularity of location-based social networks provide us with a new platform to understand users' preferences based on their location histories. In this paper, we present a location-based and preference-aware recommender system that offers a particular user a set of venues (such as restaurants) within a geospatial range with the consideration of both: 1) User preferences, which are automatically learned from her location history and 2) Social opinions, which are mined from the location histories of the local experts. This recommender system can facilitate people's travel not only near their living areas but also to a city that is new to them. As a user can only visit a limited number of locations, the user-locations matrix is very sparse, leading to a big challenge to traditional collaborative filtering-based location recommender systems. The problem becomes even more challenging when people travel to a new city. To this end, we propose a novel location recommender system, which consists of two main parts: offline modeling and online recommendation. The offline modeling part models each individual's personal preferences with a weighted category hierarchy (WCH) and infers the expertise of each user in a city with respect to different category of locations according to their location histories using an iterative learning model. 
The online recommendation part selects candidate local experts in a geospatial range that matches the user's preferences using a preference-aware candidate selection algorithm and then infers a score of the candidate locations based on the opinions of the selected local experts. Finally, the top-k ranked locations are returned as the recommendations for the user. We evaluated our system with a large-scale real dataset collected from Foursquare. The results confirm that our method offers more effective recommendations than baselines, while having a good efficiency of providing location recommendations. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Semi-supervised document retrieval Abstract: This paper proposes a new machine learning method for constructing ranking models in document retrieval. The method, which is referred to as SSRank, aims to use the advantages of both the traditional Information Retrieval (IR) methods and the supervised learning methods for IR proposed recently. The advantages include the use of limited amount of labeled data and rich model representation. To do so, the method adopts a semi-supervised learning framework in ranking model construction. Specifically, given a small number of labeled documents with respect to some queries, the method effectively labels the unlabeled documents for the queries. It then uses all the labeled data to train a machine learning model (in our case, Neural Network). In the data labeling, the method also makes use of a traditional IR model (in our case, BM25). A stopping criterion based on machine learning theory is given for the data labeling process. 
Experimental results on three benchmark datasets and one web search dataset indicate that SSRank consistently and almost always significantly outperforms the baseline methods (unsupervised and supervised learning methods), given the same amount of labeled data. This is because SSRank can effectively leverage the use of unlabeled data in learning. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Exploiting geographic dependencies for real estate appraisal: a mutual perspective of ranking and clustering Abstract: It is traditionally a challenge for home buyers to understand, compare and contrast the investment values of real estates. While a number of estate appraisal methods have been developed to value real property, the performances of these methods have been limited by the traditional data sources for estate appraisal. However, with the development of new ways of collecting estate-related mobile data, there is a potential to leverage geographic dependencies of estates for enhancing estate appraisal. Indeed, the geographic dependencies of the value of an estate can be from the characteristics of its own neighborhood (individual), the values of its nearby estates (peer), and the prosperity of the affiliated latent business area (zone). To this end, in this paper, we propose a geographic method, named ClusRanking, for estate appraisal by leveraging the mutual enforcement of ranking and clustering power. ClusRanking is able to exploit geographic individual, peer, and zone dependencies in a probabilistic ranking model. 
Specifically, we first extract the geographic utility of estates from geography data, estimate the neighborhood popularity of estates by mining taxicab trajectory data, and model the influence of latent business areas via ClusRanking. Also, we use a linear model to fuse these three influential factors and predict estate investment values. Moreover, we simultaneously consider individual, peer and zone dependencies, and derive an estate-specific ranking likelihood as the objective function. Finally, we conduct a comprehensive evaluation with real-world estate related data, and the experimental results demonstrate the effectiveness of our method. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: MaxFirst for MaxBRkNN Abstract: The MaxBRNN problem finds a region such that setting up a new service site within this region would guarantee the maximum number of customers by proximity. This problem assumes that each customer only uses the service provided by his/her nearest service site. However, in reality, a customer tends to go to his/her k nearest service sites. To handle this, MaxBRNN can be extended to the MaxBRkNN problem which finds an optimal region such that setting up a service site in this region guarantees the maximum number of customers who would consider the site as one of their k nearest service locations. We further generalize the MaxBRkNN problem to reflect the real world scenario where customers may have different preferences for different service sites, and at the same time, service sites may have preferred targeted customers. In this paper, we present an efficient solution called MaxFirst to solve this generalized MaxBRkNN problem. 
The algorithm works by partitioning the space into quadrants and searches only in those quadrants that potentially contain an optimal region. During the space partitioning, we compute the upper and lower bounds of the size of a quadrant's BRkNN, and use these bounds to prune the unpromising quadrants. Experiment results show that MaxFirst can be two to three orders of magnitude faster than the state-of-the-art algorithm. ------------------------------------------------------------------------------------------------------------------------------------------------------
# Print the top-5 recommendations produced by the count-vectorization model:
# each entry shows its title and abstract between full-width separator rules.
for _, row in result_count_vectorization.head(5).iterrows():
    print('-' * 150)
    print(f"Title: {row['title']}")
    print()
    print(f"Abstract: {row['abstract']}")
    print('-' * 150)
------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Demand driven store site selection via multiple spatial-temporal data Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: A taxi business intelligence system Abstract: The increasing availability of large-scale location traces creates unprecedent opportunities to change the paradigm for knowledge discovery in transportation systems. A particularly promising area is to extract useful business intelligence, which can be used as guidance for reducing inefficiencies in energy consumption of transportation sectors, improving customer experiences, and increasing business performances. However, extracting business intelligence from location traces is not a trivial task. Conventional data analytic tools are usually not customized for handling large, complex, dynamic, and distributed nature of location traces. To that end, we develop a taxi business intelligence system to explore the massive taxi location traces from different business perspectives with various data mining functions. Since we implement the system using the real-world taxi GPS data, this demonstration will help taxi companies to improve their business performances by understanding the behaviors of both drivers and customers. In addition, several identified technical challenges also motivate data mining people to develop more sophisticate techniques in the future. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Optimal network location queries Abstract: Given a set S of sites and a set O of weighted objects, an optimal location query finds the location(s) where introducing a new site maximizes the total weight of the objects that are closer to the new site than to any other site. With such a query, for instance, a franchise corporation (e.g., McDonald's) can find a location to open a new store such that the number of potential store customers (i.e., people living close to the store) is maximized. Optimal location queries are computationally complex to compute and require efficient solutions that scale with large datasets. Previously, two specific approaches have been proposed for efficient computation of optimal location queries. However, they both assume p-norm distance (namely, L1 and L2/Euclidean); hence, they are not applicable where sites and objects are located on spatial networks. In this paper, we focus on optimal network location (ONL) queries, i.e., optimal location queries with which objects and sites reside on a spatial network. We introduce an approach, namely EONL (short for Expansion-based ONL), which enables efficient computation of ONL queries. Moreover, with an extensive experimental study we verify and compare the efficiency of our proposed approach with real datasets, and we demonstrate the importance of considering network distance (rather than p-norm distance) with ONL queries. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Progressive computation of the min-dist optimal-location query Abstract: This paper proposes and solves the min-dist optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the min-dist optimal-location query returns a location in Q which, if a new site is built there, minimizes the average distance from each object to its closest site. This query can help a franchise (e.g. McDonald's) decide where to put a new store in order to maximize the benefit to its customers. To solve this problem is challenging, for there are theoretically infinite number of locations in Q, all of which could be candidates. This paper first provides a theorem that limits the number of candidate locations without losing the power to find exact answers. Then it provides a progressive algorithm that quickly suggests a location, tells the maximum error it may have, and keeps refining the result. When the algorithm finishes, the exact answer can be found. The intermediate result of early runs can be used to prune the search space for later runs. Crucial to the pruning technique are novel lower-bound estimators. The proposed algorithm, the effect of several optimizations, and the progressiveness are experimentally evaluated. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Semi-supervised document retrieval Abstract: This paper proposes a new machine learning method for constructing ranking models in document retrieval. The method, which is referred to as SSRank, aims to use the advantages of both the traditional Information Retrieval (IR) methods and the supervised learning methods for IR proposed recently. The advantages include the use of limited amount of labeled data and rich model representation. To do so, the method adopts a semi-supervised learning framework in ranking model construction. Specifically, given a small number of labeled documents with respect to some queries, the method effectively labels the unlabeled documents for the queries. It then uses all the labeled data to train a machine learning model (in our case, Neural Network). In the data labeling, the method also makes use of a traditional IR model (in our case, BM25). A stopping criterion based on machine learning theory is given for the data labeling process. Experimental results on three benchmark datasets and one web search dataset indicate that SSRank consistently and almost always significantly outperforms the baseline methods (unsupervised and supervised learning methods), given the same amount of labeled data. This is because SSRank can effectively leverage the use of unlabeled data in learning. ------------------------------------------------------------------------------------------------------------------------------------------------------
# Print the top-5 recommendations produced by the TF-IDF model:
# each entry shows its title and abstract between full-width separator rules.
for _, row in result_tfidf_vectorization.head(5).iterrows():
    print('-' * 150)
    print(f"Title: {row['title']}")
    print()
    print(f"Abstract: {row['abstract']}")
    print('-' * 150)
------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Demand driven store site selection via multiple spatial-temporal data Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: A taxi business intelligence system Abstract: The increasing availability of large-scale location traces creates unprecedent opportunities to change the paradigm for knowledge discovery in transportation systems. A particularly promising area is to extract useful business intelligence, which can be used as guidance for reducing inefficiencies in energy consumption of transportation sectors, improving customer experiences, and increasing business performances. However, extracting business intelligence from location traces is not a trivial task. Conventional data analytic tools are usually not customized for handling large, complex, dynamic, and distributed nature of location traces. To that end, we develop a taxi business intelligence system to explore the massive taxi location traces from different business perspectives with various data mining functions. Since we implement the system using the real-world taxi GPS data, this demonstration will help taxi companies to improve their business performances by understanding the behaviors of both drivers and customers. In addition, several identified technical challenges also motivate data mining people to develop more sophisticate techniques in the future. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Optimal network location queries Abstract: Given a set S of sites and a set O of weighted objects, an optimal location query finds the location(s) where introducing a new site maximizes the total weight of the objects that are closer to the new site than to any other site. With such a query, for instance, a franchise corporation (e.g., McDonald's) can find a location to open a new store such that the number of potential store customers (i.e., people living close to the store) is maximized. Optimal location queries are computationally complex to compute and require efficient solutions that scale with large datasets. Previously, two specific approaches have been proposed for efficient computation of optimal location queries. However, they both assume p-norm distance (namely, L1 and L2/Euclidean); hence, they are not applicable where sites and objects are located on spatial networks. In this paper, we focus on optimal network location (ONL) queries, i.e., optimal location queries with which objects and sites reside on a spatial network. We introduce an approach, namely EONL (short for Expansion-based ONL), which enables efficient computation of ONL queries. Moreover, with an extensive experimental study we verify and compare the efficiency of our proposed approach with real datasets, and we demonstrate the importance of considering network distance (rather than p-norm distance) with ONL queries. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: MaxFirst for MaxBRkNN Abstract: The MaxBRNN problem finds a region such that setting up a new service site within this region would guarantee the maximum number of customers by proximity. This problem assumes that each customer only uses the service provided by his/her nearest service site. However, in reality, a customer tends to go to his/her k nearest service sites. To handle this, MaxBRNN can be extended to the MaxBRkNN problem which finds an optimal region such that setting up a service site in this region guarantees the maximum number of customers who would consider the site as one of their k nearest service locations. We further generalize the MaxBRkNN problem to reflect the real world scenario where customers may have different preferences for different service sites, and at the same time, service sites may have preferred targeted customers. In this paper, we present an efficient solution called MaxFirst to solve this generalized MaxBRkNN problem. The algorithm works by partitioning the space into quadrants and searches only in those quadrants that potentially contain an optimal region. During the space partitioning, we compute the upper and lower bounds of the size of a quadrant's BRkNN, and use these bounds to prune the unpromising quadrants. Experiment results show that MaxFirst can be two to three orders of magnitude faster than the state-of-the-art algorithm. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Progressive computation of the min-dist optimal-location query Abstract: This paper proposes and solves the min-dist optimal-location query in spatial databases. Given a set S of sites, a set O of weighted objects, and a spatial region Q, the min-dist optimal-location query returns a location in Q which, if a new site is built there, minimizes the average distance from each object to its closest site. This query can help a franchise (e.g. McDonald's) decide where to put a new store in order to maximize the benefit to its customers. To solve this problem is challenging, for there are theoretically infinite number of locations in Q, all of which could be candidates. This paper first provides a theorem that limits the number of candidate locations without losing the power to find exact answers. Then it provides a progressive algorithm that quickly suggests a location, tells the maximum error it may have, and keeps refining the result. When the algorithm finishes, the exact answer can be found. The intermediate result of early runs can be used to prune the search space for later runs. Crucial to the pruning technique are novel lower-bound estimators. The proposed algorithm, the effect of several optimizations, and the progressiveness are experimentally evaluated. ------------------------------------------------------------------------------------------------------------------------------------------------------
# Print the top-5 recommendations produced by the Word2Vec model:
# each entry shows its title and abstract between full-width separator rules.
for _, row in result_word2vec_vectorization.head(5).iterrows():
    print('-' * 150)
    print(f"Title: {row['title']}")
    print()
    print(f"Abstract: {row['abstract']}")
    print('-' * 150)
------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Demand driven store site selection via multiple spatial-temporal data Abstract: Choosing a good location when opening a new store is crucial for the future success of a business. Traditional methods include offline manual survey, analytic models based on census data, which are either unable to adapt to the dynamic market or very time consuming. The rapid increase of the availability of big data from various types of mobile devices, such as online query data and offline positioning data, provides us with the possibility to develop automatic and accurate data- driven prediction models for business store site selection. In this paper, we propose a Demand Driven Store Site Selection (DD3S) framework for business store site selection by mining search query data from Baidu Maps. DD3S first detects the spatial-temporal distributions of customer demands on different business services via query data from Baidu Maps, the largest online map search engine in China, and detects the gaps between demand and supply. Then we determine candidate locations via clustering such gaps. In the final stage, we solve the location optimization problem by predicting and ranking the number of customers. We not only deploy supervised regression models to predict the number of customers, but also use learning-to-rank model to directly rank the locations. We evaluate our framework on various types of businesses in real-world cases, and the experiment results demonstrate the effectiveness of our methods. DD3S as the core function for store site selection has already been implemented as a core component of our business analytics platform and could be potentially used by chain store merchants on Baidu Nuomi. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Mining significant semantic locations from GPS data Abstract: With the increasing deployment and use of GPS-enabled devices, massive amounts of GPS data are becoming available. We propose a general framework for the mining of semantically meaningful, significant locations, e.g., shopping malls and restaurants, from such data. ------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Location-based recommendation system using Bayesian user's preference model in mobile devices Abstract: As wireless communication advances, research on location-based services using mobile devices has attracted interest, which provides information and services related to user's physical location. As increasing information and services, it becomes difficult to find a proper service that reflects the individual preference at proper time. Due to the small screen of mobile devices and insufficiency of resources, personalized services and convenient user interface might be useful. In this paper, we propose a map-based personalized recommendation system which reflects user's preference modeled by Bayesian Networks (BN). The structure of BN is built by an expert while the parameter is learned from the dataset. The proposed system collects context information, location, time, weather, and user request from the mobile device and infers the most preferred item to provide an appropriate service by displaying onto the mini map. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Optimal network location queries Abstract: Given a set S of sites and a set O of weighted objects, an optimal location query finds the location(s) where introducing a new site maximizes the total weight of the objects that are closer to the new site than to any other site. With such a query, for instance, a franchise corporation (e.g., McDonald's) can find a location to open a new store such that the number of potential store customers (i.e., people living close to the store) is maximized. Optimal location queries are computationally complex to compute and require efficient solutions that scale with large datasets. Previously, two specific approaches have been proposed for efficient computation of optimal location queries. However, they both assume p-norm distance (namely, L1 and L2/Euclidean); hence, they are not applicable where sites and objects are located on spatial networks. In this paper, we focus on optimal network location (ONL) queries, i.e., optimal location queries with which objects and sites reside on a spatial network. We introduce an approach, namely EONL (short for Expansion-based ONL), which enables efficient computation of ONL queries. Moreover, with an extensive experimental study we verify and compare the efficiency of our proposed approach with real datasets, and we demonstrate the importance of considering network distance (rather than p-norm distance) with ONL queries. 
------------------------------------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------------------------------------ Title: Exploiting geographic dependencies for real estate appraisal: a mutual perspective of ranking and clustering Abstract: It is traditionally a challenge for home buyers to understand, compare and contrast the investment values of real estates. While a number of estate appraisal methods have been developed to value real property, the performances of these methods have been limited by the traditional data sources for estate appraisal. However, with the development of new ways of collecting estate-related mobile data, there is a potential to leverage geographic dependencies of estates for enhancing estate appraisal. Indeed, the geographic dependencies of the value of an estate can be from the characteristics of its own neighborhood (individual), the values of its nearby estates (peer), and the prosperity of the affiliated latent business area (zone). To this end, in this paper, we propose a geographic method, named ClusRanking, for estate appraisal by leveraging the mutual enforcement of ranking and clustering power. ClusRanking is able to exploit geographic individual, peer, and zone dependencies in a probabilistic ranking model. Specifically, we first extract the geographic utility of estates from geography data, estimate the neighborhood popularity of estates by mining taxicab trajectory data, and model the influence of latent business areas via ClusRanking. Also, we use a linear model to fuse these three influential factors and predict estate investment values. Moreover, we simultaneously consider individual, peer and zone dependencies, and derive an estate-specific ranking likelihood as the objective function. 
Finally, we conduct a comprehensive evaluation with real-world estate related data, and the experimental results demonstrate the effectiveness of our method. ------------------------------------------------------------------------------------------------------------------------------------------------------
Сравнение рекомендаций¶
# Map each vectorization method's display name to its result frame
# (each frame carries a 'cosine_similarity' column aligned with `data`'s index).
recommendations = {
    'Custom': result,
    'Flag': result_flag_vectorization,
    'Count': result_count_vectorization,
    'Tf-Idf': result_tfidf_vectorization,
    'Word2Vec': result_word2vec_vectorization,
}
# Build every renamed score column first, then concatenate once.
# The previous version re-assigned `data` via pd.concat inside the loop,
# which copies the entire frame on each iteration (quadratic in the
# number of methods); a single concat keeps the same columns and order.
score_columns = [
    recommendation[['cosine_similarity']].rename(
        columns={'cosine_similarity': f'{method} method'}
    )
    for method, recommendation in recommendations.items()
]
data = pd.concat([data, *score_columns], axis=1)
data
| url | deapth | title | authors | source | number and pages | doi | published | citation | metric | abstract | references | process_abstract | Custom method | Flag method | Count method | Tf-Idf method | Word2Vec method | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 0 | Demand driven store site selection via multipl... | [Mengwen Xu, Tianyi Wang, Zhengwei Wu, Jingbo ... | SIGSPACIAL '16: Proceedings of the 24th ACM SI... | Article No.: 40, Pages 1 - 10 | https://doi.org/10.1145/2996913.2996996 | 2016-10-31 | 26.0 | 617.0 | Choosing a good location when opening a new st... | [https://dl.acm.org/doi/10.1016/S0305-0548(01)... | choose good location open new store crucial fu... | 0.312190 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 1 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | The generalized maximal covering location problem | [Oded Berman, Dmitry Krass] | Computers and Operations Research | NaN | https://doi.org/10.1016/S0305-0548(01)00079-X | 2002-05-01 | 34.0 | 0.0 | We consider a generalization of the maximal co... | [] | consider generalization maximal cover location... | 0.146254 | 0.089562 | 0.150723 | 0.064708 | 0.170892 |
| 2 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Random Forests | [Leo Breiman] | Machine Learning | NaN | https://doi.org/10.1023/A:1010933404324 | 2001-10-01 | 9828.0 | 0.0 | Random forests are a combination of tree predi... | [https://dl.acm.org/doi/10.1162/neco.1997.9.7.... | random forest combination tree predictor tree ... | 0.033315 | 0.081604 | 0.048373 | 0.034469 | 0.080029 |
| 3 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Efficient algorithms for optimal location quer... | [Zitong Chen, Yubao Liu, Raymond Chi-Wing Wong... | SIGMOD '14: Proceedings of the 2014 ACM SIGMOD... | NaN | https://doi.org/10.1145/2588555.2612172 | 2014-06-18 | 47.0 | 790.0 | In this paper, we study the optimal location q... | [https://dl.acm.org/doi/10.14778/2350229.23502... | paper study optimal location query problem bas... | 0.126302 | 0.180469 | 0.203188 | 0.093193 | 0.210058 |
| 4 | https://dl.acm.org/doi/10.1145/2996913.2996996 | 1 | Mean Shift: A Robust Approach Toward Feature S... | [Dorin Comaniciu, Peter Meer] | IEEE Transactions on Pattern Analysis and Mach... | NaN | https://doi.org/10.1109/34.1000236 | 2002-05-01 | 2062.0 | 0.0 | A general nonparametric technique is proposed ... | [https://dl.acm.org/doi/10.1007/BF00128233, ht... | general nonparametric technique propose analys... | 0.057567 | 0.094007 | 0.082959 | 0.038648 | 0.122549 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 164 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Machine learning approaches for high-resolutio... | [Ranga Raju Vatsavai, Eddie Bright, Chandola V... | COM.Geo '11: Proceedings of the 2nd Internatio... | Article No.: 11, Pages 1 - 10 | https://doi.org/10.1145/1999320.1999331 | 2011-05-23 | 18.0 | 526.0 | The proliferation of several machine learning ... | [https://dl.acm.org/doi/10.5555/1191551.119179... | proliferation machine learning approach make d... | 0.000000 | 0.042875 | 0.017969 | 0.015740 | 0.013971 |
| 165 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Geographical topic discovery and comparison | [Zhijun Yin, Liangliang Cao, Jiawei Han, Cheng... | WWW '11: Proceedings of the 20th international... | NaN | https://doi.org/10.1145/1963405.1963443 | 2011-03-28 | 232.0 | 1642.0 | This paper studies the problem of discovering ... | [https://dl.acm.org/doi/10.5555/944919.944937,... | paper study problem discover compare geographi... | 0.080845 | 0.132020 | 0.194915 | 0.082114 | 0.126442 |
| 166 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Driving with knowledge from the physical world | [Jing Yuan, Yu Zheng, Xing Xie, Guangzhong Sun] | KDD '11: Proceedings of the 17th ACM SIGKDD in... | NaN | https://doi.org/10.1145/2020408.2020462 | 2011-08-21 | 641.0 | 2908.0 | This paper presents a Cloud-based system compu... | [https://dl.acm.org/doi/10.1016/j.eswa.2008.07... | paper present cloud base system computing cust... | 0.109150 | 0.178240 | 0.113181 | 0.076104 | 0.132803 |
| 167 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Where to find my next passenger | [Jing Yuan, Yu Zheng, Liuhang Zhang, XIng Xie,... | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030128 | 2011-09-17 | 276.0 | 2024.0 | We present a recommender for taxi drivers and ... | [https://dl.acm.org/doi/10.1145/304182.304187,... | present recommender taxi driver people expect ... | 0.106132 | 0.101100 | 0.123165 | 0.043080 | 0.098161 |
| 168 | https://dl.acm.org/doi/10.1145/2487575.2487616 | 2 | Urban computing with taxicabs | [Yu Zheng, Yanchi Liu, Jing Yuan, Xing Xie] | UbiComp '11: Proceedings of the 13th internati... | NaN | https://doi.org/10.1145/2030112.2030126 | 2011-09-17 | 413.0 | 3122.0 | Urban computing for city planning is one of th... | [https://dl.acm.org/doi/10.5555/645484.656550,... | urban computing city planning significant appl... | 0.107280 | 0.131390 | 0.085192 | 0.048406 | 0.077540 |
169 rows × 18 columns
# Fold the five per-method similarity scores into one ranking metric,
# then show the 15 highest-ranked articles.
score_columns = [
    'Custom method',
    'Flag method',
    'Count method',
    'Tf-Idf method',
    'Word2Vec method',
]
data['Total score'] = data[score_columns].sum(axis=1)

display_columns = [
    'title',
    'published',
    'citation',
    'metric',
    'abstract',
    'Total score',
]
data.sort_values(by='Total score', ascending=False)[display_columns].head(15)
| title | published | citation | metric | abstract | Total score | |
|---|---|---|---|---|---|---|
| 0 | Demand driven store site selection via multipl... | 2016-10-31 | 26.0 | 617.0 | Choosing a good location when opening a new st... | 4.312190 |
| 7 | Optimal network location queries | 2010-11-02 | 21.0 | 199.0 | Given a set S of sites and a set O of weighted... | 1.165115 |
| 30 | Progressive computation of the min-dist optima... | 2006-09-01 | 36.0 | 312.0 | This paper proposes and solves the min-dist op... | 1.148950 |
| 159 | A taxi business intelligence system | 2011-08-21 | 54.0 | 1288.0 | The increasing availability of large-scale loc... | 1.148390 |
| 31 | MaxFirst for MaxBRkNN | 2011-04-11 | 26.0 | 0.0 | The MaxBRNN problem finds a region such that s... | 1.039957 |
| 79 | Location-based and preference-aware recommenda... | 2012-11-06 | 503.0 | 3713.0 | The popularity of location-based social networ... | 0.993410 |
| 65 | Semi-supervised document retrieval | 2009-05-01 | 21.0 | 0.0 | This paper proposes a new machine learning met... | 0.965735 |
| 13 | Trade area analysis using user generated mobil... | 2013-05-13 | 57.0 | 693.0 | In this paper, we illustrate how User Generate... | 0.961256 |
| 22 | The optimal-location query | 2005-08-22 | 38.0 | 0.0 | We propose and solve the optimal-location quer... | 0.956050 |
| 125 | Privacy-friendly business models for location-... | 2011-08-01 | 7.0 | 0.0 | This paper presents a theoretical model to ana... | 0.954832 |
| 9 | Geo-spotting: mining online location-based ser... | 2013-08-11 | 193.0 | 2151.0 | The problem of identifying the optimal locatio... | 0.952292 |
| 6 | Exploiting geographic dependencies for real es... | 2014-08-24 | 81.0 | 1076.0 | It is traditionally a challenge for home buyer... | 0.894539 |
| 29 | Efficient methods for finding influential loca... | 2011-10-24 | 30.0 | 207.0 | Given a set S of servers and a set C of client... | 0.890261 |
| 80 | Learning to rank using gradient descent | 2005-08-07 | 1734.0 | 7323.0 | We investigate using gradient descent methods ... | 0.874327 |
| 135 | Mining significant semantic locations from GPS... | 2010-09-01 | 254.0 | 1734.0 | With the increasing deployment and use of GPS-... | 0.866132 |
data.to_excel('analysis_results/analysis_articles.xlsx', index=False)
Визуализация векторов с помощью PCA¶
def get_pca_visualization(dataframe: pd.DataFrame, method: str):
    """Render an interactive 3D scatter of PCA components for one method.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain columns 'Component 0'..'Component 2' plus the metadata
        columns 'title', 'published', 'citation', 'metric', 'Total score'
        and 'Total score processed'.
    method : str
        Vectorization method name; only used in the plot title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure is both shown and returned, so callers may additionally
        save it (e.g. ``fig.write_html(...)``). Returning the figure is
        backward-compatible: existing callers simply ignore the value.
    """
    fig = px.scatter_3d(
        dataframe,
        x='Component 0',
        y='Component 1',
        z='Component 2',
        hover_data=['title', 'published', 'citation', 'metric', 'Total score'],
        # 'Total score processed' is the outlier-clipped score; using it for
        # both size and colour keeps a few extreme points from washing out
        # the whole scale.
        size='Total score processed',
        size_max=40,
        color='Total score processed',
        color_continuous_scale=px.colors.diverging.Spectral_r,
        height=800,
        width=1100,
        title=f'{method} vectorization',
    )
    fig.show()
    return fig
# Project every vectorization into 3 principal components and attach the
# article metadata needed for hover text and marker sizing in the plots.
vectors = {
    'Flag': flag_vectorization,
    'Count': count_vectorization,
    'Tf-Idf': tfidf_vectorization,
    'Word2Vec': word2vec_vectorization,
}
meta_columns = ['title', 'published', 'citation', 'metric', 'Total score']
decompositions = {}
pca = PCA(n_components=3, random_state=42)
for name, matrix in vectors.items():
    components = pd.DataFrame(
        pca.fit_transform(matrix),
        columns=[f'Component {i}' for i in range(3)],
    )
    frame = pd.concat([data[meta_columns], components], axis=1)
    # Clip the top ~0.8% of scores: a handful of outliers would otherwise
    # dominate the size/colour scale of the 3D scatter plots.
    cap = frame['Total score'].quantile(0.992)
    frame['Total score processed'] = frame['Total score'].mask(
        frame['Total score'] > cap,
        cap * 1.05,
    )
    decompositions[name] = frame
# Draw one 3D scatter per vectorization method.
for method_name, frame in decompositions.items():
    get_pca_visualization(frame, method_name)